
Merge branch 'master' into develop-gym-wrapper

/develop/gym-wrapper
vincentpierre 4 years ago
Current commit
6ddfe74f
50 changed files with 1203 additions and 654 deletions
  1. .github/ISSUE_TEMPLATE/bug_report.md (1 change)
  2. .pre-commit-config.yaml (23 changes)
  3. .yamato/com.unity.ml-agents-test.yml (11 changes)
  4. .yamato/gym-interface-test.yml (22 changes)
  5. .yamato/protobuf-generation-test.yml (14 changes)
  6. .yamato/python-ll-api-test.yml (21 changes)
  7. .yamato/standalone-build-test.yml (20 changes)
  8. .yamato/training-int-tests.yml (25 changes)
  9. Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs (23 changes)
  10. README.md (35 changes)
  11. com.unity.ml-agents/CHANGELOG.md (3 changes)
  12. com.unity.ml-agents/Editor/BrainParametersDrawer.cs (20 changes)
  13. com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (18 changes)
  14. com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs (13 changes)
  15. docs/Learning-Environment-Create-New.md (5 changes)
  16. docs/Learning-Environment-Design-Agents.md (3 changes)
  17. docs/Learning-Environment-Executable.md (2 changes)
  18. docs/Migrating.md (4 changes)
  19. docs/Python-API.md (22 changes)
  20. docs/Training-ML-Agents.md (338 changes)
  21. docs/Using-Tensorboard.md (8 changes)
  22. gym-unity/gym_unity/envs/__init__.py (21 changes)
  23. gym-unity/gym_unity/tests/test_gym.py (5 changes)
  24. ml-agents-envs/mlagents_envs/base_env.py (50 changes)
  25. ml-agents-envs/mlagents_envs/environment.py (293 changes)
  26. ml-agents-envs/mlagents_envs/tests/test_envs.py (54 changes)
  27. ml-agents-envs/mlagents_envs/tests/test_side_channel.py (118 changes)
  28. ml-agents/mlagents/trainers/learn.py (14 changes)
  29. ml-agents/mlagents/trainers/models.py (25 changes)
  30. ml-agents/mlagents/trainers/policy/tf_policy.py (1 change)
  31. ml-agents/mlagents/trainers/ppo/optimizer.py (34 changes)
  32. ml-agents/mlagents/trainers/ppo/trainer.py (4 changes)
  33. ml-agents/mlagents/trainers/sac/optimizer.py (12 changes)
  34. ml-agents/mlagents/trainers/simple_env_manager.py (8 changes)
  35. ml-agents/mlagents/trainers/subprocess_env_manager.py (10 changes)
  36. ml-agents/mlagents/trainers/tests/simple_test_envs.py (12 changes)
  37. ml-agents/mlagents/trainers/tests/test_learn.py (5 changes)
  38. ml-agents/mlagents/trainers/tests/test_nn_policy.py (4 changes)
  39. ml-agents/mlagents/trainers/tests/test_simple_rl.py (4 changes)
  40. ml-agents/tests/yamato/check_coverage_percent.py (6 changes)
  41. ml-agents/tests/yamato/scripts/run_llapi.py (4 changes)
  42. ml-agents/tests/yamato/training_int_tests.py (21 changes)
  43. ml-agents/tests/yamato/yamato_utils.py (28 changes)
  44. utils/validate_versions.py (6 changes)
  45. com.unity.ml-agents/Tests/Editor/Communicator/GrpcExtensionsTests.cs (37 changes)
  46. docs/Versioning.md (95 changes)
  47. ml-agents-envs/mlagents_envs/env_utils.py (108 changes)
  48. ml-agents-envs/mlagents_envs/side_channel/side_channel_manager.py (81 changes)
  49. ml-agents-envs/mlagents_envs/tests/test_env_utils.py (64 changes)
  50. ml-agents-envs/mlagents_envs/tests/test_steps.py (102 changes)

.github/ISSUE_TEMPLATE/bug_report.md (1 change)


If applicable, add screenshots to help explain your problem.
**Environment (please complete the following information):**
- Unity Version: [e.g. Unity 2020.1f1]
- OS + version: [e.g. Windows 10]
- _ML-Agents version_: (e.g. ML-Agents v0.8, or latest `develop` branch from source)
- _TensorFlow version_: (you can run `pip3 show tensorflow` to get this)

.pre-commit-config.yaml (23 changes)


files: "gym-unity/.*"
args: [--ignore-missing-imports, --disallow-incomplete-defs]
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.1
hooks:
- id: flake8
exclude: >
(?x)^(
.*_pb2.py|
.*_pb2_grpc.py
)$
# flake8-tidy-imports is used for banned-modules, not actually tidying
additional_dependencies: [flake8-comprehensions==3.2.2, flake8-tidy-imports==4.1.0, flake8-bugbear==20.1.4]
rev: v2.4.0
rev: v2.5.0
hooks:
- id: mixed-line-ending
exclude: >

.*.meta
)$
args: [--fix=lf]
- id: flake8
exclude: >
(?x)^(
.*_pb2.py|
.*_pb2_grpc.py
)$
# flake8-tidy-imports is used for banned-modules, not actually tidying
additional_dependencies: [flake8-comprehensions==3.1.4, flake8-tidy-imports==4.0.0, flake8-bugbear==20.1.2]
- id: trailing-whitespace
name: trailing-whitespace-markdown
types: [markdown]

.yamato/com.unity.ml-agents-test.yml (11 changes)


triggers:
cancel_old_ci: true
{% if platform.name == "mac" %}
changes:
only:
- "com.unity.ml-agents/**"
- "ml-agents/tests/yamato/**"
- ".yamato/com.unity.ml-agents-test.yml"
expression: |
(pull_request.target eq "master" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR
pull_request.changes.any match ".yamato/com.unity.ml-agents-test.yml")
{% endif %}
{% endfor %}
{% endfor %}

.yamato/gym-interface-test.yml (22 changes)


- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/gym-interface-test.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
expression: |
(pull_request.target eq "master" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR
pull_request.changes.any match "Project/**" OR
pull_request.changes.any match "ml-agents/**" OR
pull_request.changes.any match "ml-agents-envs/**" OR
pull_request.changes.any match "gym-unity/**" OR
pull_request.changes.any match ".yamato/gym-interface-test.yml") AND
NOT pull_request.changes.all match "**/*.md"
{% endfor %}

.yamato/protobuf-generation-test.yml (14 changes)


git diff -- :/ ":(exclude,top)$CS_PROTO_PATH/*.meta" > artifacts/proto.patch; exit $GIT_ERR; }
triggers:
cancel_old_ci: true
changes:
only:
- "protobuf-definitions/**"
- ".yamato/protobuf-generation-test.yml"
except:
- "protobuf-definitions/*.md"
- "protobuf-definitions/**/*.md"
expression: |
(pull_request.target eq "master" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "protobuf-definitions/**" OR
pull_request.changes.any match ".yamato/protobuf-generation-test.yml") AND
NOT pull_request.changes.all match "protobuf-definitions/**/*.md"
artifacts:
patch:
paths:

.yamato/python-ll-api-test.yml (21 changes)


- .yamato/standalone-build-test.yml#test_mac_standalone_{{ editor.version }}
triggers:
cancel_old_ci: true
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/python-ll-api-test.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
expression: |
(pull_request.target eq "master" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR
pull_request.changes.any match "Project/**" OR
pull_request.changes.any match "ml-agents/**" OR
pull_request.changes.any match "ml-agents-envs/**" OR
pull_request.changes.any match ".yamato/python-ll-api-test.yml") AND
NOT pull_request.changes.all match "**/*.md"
{% endfor %}

.yamato/standalone-build-test.yml (20 changes)


- pip install pyyaml
- python -u -m ml-agents.tests.yamato.standalone_build_tests
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/Basic/Scenes/Basic.unity
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/Bouncer/Scenes/Bouncer.unity
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/WallJump/Scenes/WallJump.unity
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/Bouncer/Scenes/Bouncer.unity
- python -u -m ml-agents.tests.yamato.standalone_build_tests --scene=Assets/ML-Agents/Examples/WallJump/Scenes/WallJump.unity
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- ".yamato/standalone-build-test.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
expression: |
(pull_request.target eq "master" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR
pull_request.changes.any match ".yamato/standalone-build-test.yml") AND
NOT pull_request.changes.all match "**/*.md"
artifacts:
logs:
paths:

.yamato/training-int-tests.yml (25 changes)


# Backwards-compatibility tests.
# If we make a breaking change to the communication protocol, these will need
# to be disabled until the next release.
# - python -u -m ml-agents.tests.yamato.training_int_tests --python=0.15.0
# - python -u -m ml-agents.tests.yamato.training_int_tests --csharp=0.15.0
- python -u -m ml-agents.tests.yamato.training_int_tests --python=0.16.0
- python -u -m ml-agents.tests.yamato.training_int_tests --csharp=1.0.0
changes:
only:
- "com.unity.ml-agents/**"
- "Project/**"
- "ml-agents/**"
- "ml-agents-envs/**"
- ".yamato/training-int-tests.yml"
except:
- "*.md"
- "com.unity.ml-agents/*.md"
- "com.unity.ml-agents/**/*.md"
expression: |
(pull_request.target eq "master" OR
pull_request.target match "release.+") AND
NOT pull_request.draft AND
(pull_request.changes.any match "com.unity.ml-agents/**" OR
pull_request.changes.any match "Project/**" OR
pull_request.changes.any match "ml-agents/**" OR
pull_request.changes.any match "ml-agents-envs/**" OR
pull_request.changes.any match ".yamato/training-int-tests.yml") AND
NOT pull_request.changes.all match "**/*.md"
artifacts:
logs:
paths:

Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs (23 changes)


{
const string k_CommandLineModelOverrideFlag = "--mlagents-override-model";
const string k_CommandLineQuitAfterEpisodesFlag = "--mlagents-quit-after-episodes";
const string k_CommandLineQuitOnLoadFailure = "--mlagents-quit-on-load-failure";
// The attached Agent
Agent m_Agent;

int m_MaxEpisodes;
int m_NumSteps;
bool m_QuitOnLoadFailure;
/// <summary>
/// Get the asset path to use from the commandline arguments.

var maxEpisodes = 0;
var args = Environment.GetCommandLineArgs();
for (var i = 0; i < args.Length - 1; i++)
for (var i = 0; i < args.Length; i++)
{
if (args[i] == k_CommandLineModelOverrideFlag && i < args.Length-2)
{

}
else if (args[i] == k_CommandLineQuitAfterEpisodesFlag)
else if (args[i] == k_CommandLineQuitAfterEpisodesFlag && i < args.Length-1)
}
else if (args[i] == k_CommandLineQuitOnLoadFailure)
{
m_QuitOnLoadFailure = true;
}
}

var behaviorName = bp.BehaviorName;
var nnModel = GetModelForBehaviorName(behaviorName);
Debug.Log($"Overriding behavior {behaviorName} for agent with model {nnModel?.name}");
if (nnModel == null && m_QuitOnLoadFailure)
{
Debug.Log(
$"Didn't find a model for behaviorName {behaviorName}. Make " +
$"sure the behaviorName is set correctly in the commandline " +
$"and that the model file exists"
);
Application.Quit(1);
}
var modelName = nnModel != null ? nnModel.name : "<null>";
Debug.Log($"Overriding behavior {behaviorName} for agent with model {modelName}");
// This might give a null model; that's better because we'll fall back to the Heuristic
m_Agent.SetModel($"Override_{behaviorName}", nnModel);

README.md (35 changes)


## Releases & Documentation
**Our latest, stable release is `Release 1`. Click [here](docs/Readme.md) to
get started with the latest release of ML-Agents.**
**Our latest, stable release is `Release 1`. Click
[here](https://github.com/Unity-Technologies/ml-agents/tree/release_1/docs/Readme.md)
to get started with the latest release of ML-Agents.**
The table below lists all our releases, including our `master` branch which is under active
development and may be unstable. A few helpful guidelines:
* The docs links in the table below include installation and usage instructions specific to each
release. Remember to always use the documentation that corresponds to the release version you're
using.
* See the [GitHub releases](https://github.com/Unity-Technologies/ml-agents/releases) for more
details of the changes between versions.
* If you have used an earlier version of the ML-Agents Toolkit, we strongly recommend our
[guide on migrating from earlier versions](docs/Migrating.md).
The table below lists all our releases, including our `master` branch which is
under active development and may be unstable. A few helpful guidelines:
- The [Versioning page](docs/Versioning.md) overviews how we manage our GitHub
releases and the versioning process for each of the ML-Agents components.
- The [Releases page](https://github.com/Unity-Technologies/ml-agents/releases)
contains details of the changes between releases.
- The [Migration page](docs/Migrating.md) contains details on how to upgrade
from earlier releases of the ML-Agents Toolkit.
- The **Documentation** links in the table below include installation and usage
instructions specific to each release. Remember to always use the
documentation that corresponds to the release version you're using.
| **Version** | **Release Date** | **Source** | **Documentation** | **Download** |
|:-------:|:------:|:-------------:|:-------:|:------------:|

If you use Unity or the ML-Agents Toolkit to conduct research, we ask that you
cite the following paper as a reference:
Juliani, A., Berges, V., Vckay, E., Gao, Y., Henry, H., Mattar, M., Lange, D.
(2018). Unity: A General Platform for Intelligent Agents. _arXiv preprint
arXiv:1809.02627._ https://github.com/Unity-Technologies/ml-agents.
Juliani, A., Berges, V., Teng, E., Cohen, A., Harper, J., Elion, C., Goy, C.,
Gao, Y., Henry, H., Mattar, M., Lange, D. (2020). Unity: A General Platform for
Intelligent Agents. _arXiv preprint
[arXiv:1809.02627](https://arxiv.org/abs/1809.02627)._
https://github.com/Unity-Technologies/ml-agents.
- (May 12, 2020)
[Announcing ML-Agents Unity Package v1.0!](https://blogs.unity3d.com/2020/05/12/announcing-ml-agents-unity-package-v1-0/)
- (February 28, 2020)
[Training intelligent adversaries using self-play with ML-Agents](https://blogs.unity3d.com/2020/02/28/training-intelligent-adversaries-using-self-play-with-ml-agents/)
- (November 11, 2019)

com.unity.ml-agents/CHANGELOG.md (3 changes)


#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
- `beta` and `epsilon` in `PPO` are no longer decayed by default but follow the same schedule as learning rate. (#3940)
- `get_behavior_names()` and `get_behavior_spec()` on UnityEnvironment were replaced by the `behavior_specs` property. (#3946)
### Minor Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)

- Unity Player logs are now written out to the results directory. (#3877)
- Run configuration YAML files are written out to the results directory at the end of the run. (#3815)
### Bug Fixes
- An issue was fixed where using `--initialize-from` would resume from the past step count. (#3962)
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)

com.unity.ml-agents/Editor/BrainParametersDrawer.cs (20 changes)


static void DrawContinuousVectorAction(Rect position, SerializedProperty property)
{
var vecActionSize = property.FindPropertyRelative(k_ActionSizePropName);
vecActionSize.arraySize = 1;
// This check is here due to:
// https://fogbugz.unity3d.com/f/cases/1246524/
// If this case has been resolved, please remove this if condition.
if (vecActionSize.arraySize != 1)
{
vecActionSize.arraySize = 1;
}
var continuousActionSize =
vecActionSize.GetArrayElementAtIndex(0);
EditorGUI.PropertyField(

static void DrawDiscreteVectorAction(Rect position, SerializedProperty property)
{
var vecActionSize = property.FindPropertyRelative(k_ActionSizePropName);
vecActionSize.arraySize = EditorGUI.IntField(
var newSize = EditorGUI.IntField(
// This check is here due to:
// https://fogbugz.unity3d.com/f/cases/1246524/
// If this case has been resolved, please remove this if condition.
if (newSize != vecActionSize.arraySize)
{
vecActionSize.arraySize = newSize;
}
position.y += k_LineHeight;
position.x += 20;
position.width -= 20;

com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (18 changes)


{
var agentInfoProto = ai.ToAgentInfoProto();
var agentActionProto = new AgentActionProto
var agentActionProto = new AgentActionProto();
if(ai.storedVectorActions != null)
VectorActions = { ai.storedVectorActions }
};
agentActionProto.VectorActions.AddRange(ai.storedVectorActions);
}
return new AgentInfoActionPairProto
{

var brainParametersProto = new BrainParametersProto
{
VectorActionSize = { bp.VectorActionSize },
VectorActionSpaceType =
(SpaceTypeProto)bp.VectorActionSpaceType,
VectorActionSpaceType = (SpaceTypeProto) bp.VectorActionSpaceType,
brainParametersProto.VectorActionDescriptions.AddRange(bp.VectorActionDescriptions);
if(bp.VectorActionDescriptions != null)
{
brainParametersProto.VectorActionDescriptions.AddRange(bp.VectorActionDescriptions);
}
return brainParametersProto;
}

/// </summary>
public static DemonstrationMetaProto ToProto(this DemonstrationMetaData dm)
{
var demonstrationName = dm.demonstrationName ?? "";
var demoProto = new DemonstrationMetaProto
{
ApiVersion = DemonstrationMetaData.ApiVersion,

DemonstrationName = dm.demonstrationName
DemonstrationName = demonstrationName
};
return demoProto;
}

com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs (13 changes)


{
public Action OnRequestDecision;
ObservationWriter m_ObsWriter = new ObservationWriter();
public void RequestDecision(AgentInfo info, List<ISensor> sensors) {
foreach(var sensor in sensors){
public void RequestDecision(AgentInfo info, List<ISensor> sensors)
{
foreach (var sensor in sensors)
{
sensor.GetObservationProto(m_ObsWriter);
}
OnRequestDecision?.Invoke();

agent1.SetPolicy(policy);
StackingSensor sensor = null;
foreach(ISensor s in agent1.sensors){
if (s is StackingSensor){
foreach (ISensor s in agent1.sensors)
{
if (s is StackingSensor)
{
sensor = s as StackingSensor;
}
}

{
agent1.RequestDecision();
aca.EnvironmentStep();
}
policy.OnRequestDecision = () => SensorTestHelper.CompareObservation(sensor, new[] {18f, 19f, 21f});

docs/Learning-Environment-Create-New.md (5 changes)


learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e4
memory_size: 128
normalize: false
num_epoch: 3
num_layers: 2

reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
strength: 1.0
gamma: 0.99
```
Since this example creates a very simple training environment with only a few

docs/Learning-Environment-Design-Agents.md (3 changes)


0, rays will be used instead of spheres. Rays may be more efficient,
especially in complex scenes.
- _Ray Length_ The length of the casts
- _Ray Layer Mask_ The [LayerMask](https://docs.unity3d.com/ScriptReference/LayerMask.html)
passed to the raycast or spherecast. This can be used to ignore certain types
of objects when casting.
- _Observation Stacks_ The number of previous results to "stack" with the cast
results. Note that this can be independent of the "Stacked Vectors" setting in
`Behavior Parameters`.

docs/Learning-Environment-Executable.md (2 changes)


the directory where you installed the ML-Agents Toolkit, run:
```sh
mlagents-learn ../config/ppo/3DBall.yaml --env=3DBall --run-id=firstRun
mlagents-learn config/ppo/3DBall.yaml --env=3DBall --run-id=firstRun
```
And you should see something like

docs/Migrating.md (4 changes)


- Trainer configuration, curriculum configuration, and parameter randomization
configuration have all been moved to a single YAML file. (#3791)
- `max_step` in the `TerminalStep` and `TerminalSteps` objects was renamed `interrupted`.
- On the UnityEnvironment API, `get_behavior_names()` and `get_behavior_specs()` methods were combined into the property `behavior_specs` that contains a mapping from behavior names to behavior spec.
### Steps to Migrate
- Before upgrading, copy your `Behavior Name` sections from `trainer_config.yaml` into

the contents of the sampler config to `parameter_randomization` in the main trainer configuration.
- If you are using `UnityEnvironment` directly, replace `max_step` with `interrupted`
in the `TerminalStep` and `TerminalSteps` objects.
- Replace usage of `get_behavior_names()` and `get_behavior_specs()` in UnityEnvironment with `behavior_specs`.
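For example, the last step above might look like the following sketch (the environment handle and the loop are illustrative only; `behavior_specs` is the new read-only mapping, and the spec field names follow the `BehaviorSpec` shown in `base_env.py` in this change):
```python
from mlagents_envs.environment import UnityEnvironment

# Connect to an Editor session on the default port (a built binary path also works).
env = UnityEnvironment(file_name=None)
env.reset()

# Old (removed): names = env.get_behavior_names(); spec = env.get_behavior_spec(names[0])
# New: a single read-only mapping from behavior name to BehaviorSpec.
for name, spec in env.behavior_specs.items():
    print(name, spec.observation_shapes, spec.action_type)

env.close()
```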
## Migrating from 0.15 to Release 1

data in the new MonoBehaviour instead.
- If the class overrode the virtual methods, create a new MonoBehaviour and
move the logic to it:
- Move the InitializeAcademy code to MonoBehaviour.OnAwake
- Move the InitializeAcademy code to MonoBehaviour.Awake
- Move the AcademyStep code to MonoBehaviour.FixedUpdate
- Move the OnDestroy code to MonoBehaviour.OnDestroy.
- Move the AcademyReset code to a new method and add it to the

docs/Python-API.md (22 changes)


```python
from mlagents_envs.environment import UnityEnvironment
# This is a non-blocking call that only loads the environment.
# Start interacting with the environment.
env.reset()
behavior_names = env.behavior_specs.keys()
...
**NOTE:** Please read [Interacting with a Unity Environment](#interacting-with-a-unity-environment)
to read more about how you can interact with the Unity environment from Python.
- `file_name` is the name of the environment binary (located in the root
directory of the python project).

act.
- **Close : `env.close()`** Sends a shutdown signal to the environment and
terminates the communication.
- **Get Behavior Names : `env.get_behavior_names()`** Returns a list of
`BehaviorName`. Note that the number of groups can change over time in the
simulation if new Agent behaviors are created in the simulation.
- **Get Behavior Spec : `env.get_behavior_spec(behavior_name: str)`** Returns
the `BehaviorSpec` corresponding to the behavior_name given as input. A
`BehaviorSpec` contains information such as the observation shapes, the action
type (multi-discrete or continuous) and the action shape. Note that the
`BehaviorSpec` for a specific group is fixed throughout the simulation.
- **Behavior Specs : `env.behavior_specs`** Returns a Mapping of
`BehaviorName` to `BehaviorSpec` objects (read only).
A `BehaviorSpec` contains information such as the observation shapes, the
action type (multi-discrete or continuous) and the action shape. Note that
the `BehaviorSpec` for a specific group is fixed throughout the simulation.
The number of entries in the Mapping can change over time in the simulation
if new Agent behaviors are created in the simulation.
- **Get Steps : `env.get_steps(behavior_name: str)`** Returns a tuple
`DecisionSteps, TerminalSteps` corresponding to the behavior_name given as
input. The `DecisionSteps` contains information about the state of the agents
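A rough sketch of the interaction loop these calls describe (the binary name `3DBall` and the fixed step count are placeholders; `create_empty_action` is the `BehaviorSpec` helper used elsewhere in this change):
```python
from mlagents_envs.environment import UnityEnvironment

# Placeholder binary name; any built environment works.
env = UnityEnvironment(file_name="3DBall")
env.reset()
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]

for _ in range(100):
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    # Send an all-zeros action for every agent that requested a decision.
    env.set_actions(behavior_name, spec.create_empty_action(len(decision_steps)))
    env.step()

env.close()
```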

docs/Training-ML-Agents.md (338 changes)


#### Observing Training
Regardless of which training methods, configurations or hyperparameters you
provide, the training process will always generate three artifacts:
provide, the training process will always generate three artifacts, all found
in the `results/<run-identifier>` folder:
1. Summaries (under the `summaries/` folder): these are training metrics that
1. Summaries: these are training metrics that
1. Models (under the `models/` folder): these contain the model checkpoints that
1. Models: these contain the model checkpoints that
1. Timers file (also under the `summaries/` folder): this contains aggregated
1. Timers file (under `results/<run-identifier>/run_logs`): this contains aggregated
metrics on your training process, including time spent on specific code
blocks. See [Profiling in Python](Profiling-Python.md) for more information
on the timers generated.

This section offers a detailed guide into how to manage the different training
set-ups within the toolkit.
More specifically, this section offers a detailed guide on four command-line
More specifically, this section offers a detailed guide on the command-line
Behavior in the scene
- `--curriculum`: defines the set-up for Curriculum Learning
- `--sampler`: defines the set-up for Environment Parameter Randomization
Behavior in the scene, and the set-ups for Curriculum Learning and
Environment Parameter Randomization
- `--num-envs`: number of concurrent Unity instances to use during training
Reminder that a detailed description of all command-line options can be found by

process when the default parameters don't seem to be giving the level of
performance you would like. We provide sample configuration files for our
example environments in the [config/](../config/) directory. The
`config/trainer_config.yaml` was used to train the 3D Balance Ball in the
`config/ppo/3DBall.yaml` was used to train the 3D Balance Ball in the
[Getting Started](Getting-Started.md) guide. That configuration file uses the
PPO trainer, but we also have configuration files for SAC and GAIL.

add typically has its own training configurations or additional configuration
files. For instance:
add typically has its own training configurations. For instance:
- Use PPO or SAC?
- Use Recurrent Neural Networks for adding memory to your agents?

demonstrations.)
- Use self-play? (Assuming your environment includes multiple agents.)
The answers to the above questions will dictate the configuration files and the
parameters within them. The rest of this section breaks down the different
configuration files and explains the possible settings for each.
The trainer config file, `<trainer-config-file>`, determines the features you will
use during training, and the answers to the above questions will dictate its contents.
The rest of this guide breaks down the different sub-sections of the trainer config file
and explains the possible settings for each.
### Trainer Config File
### Behavior Configurations
We begin with the trainer config file, `<trainer-config-file>`, which includes a
set of configurations for each Behavior in your scene. Some of the
The primary section of the trainer config file is a
set of configurations for each Behavior in your scene. These are defined under
the sub-section `behaviors` in your trainer config file. Some of the
curriculum and environment parameter randomization settings are not part of this
file, but their settings live in different files that we'll cover in subsequent
sections.
curriculum and environment parameter randomization settings are not part of the `behaviors`
configuration, but their settings live in different sections that we'll cover subsequently.
BehaviorPPO:
trainer: ppo
behaviors:
BehaviorPPO:
trainer: ppo
# Trainer configs common to PPO/SAC (excluding reward signals)
batch_size: 1024
buffer_size: 10240
hidden_units: 128
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e5
normalize: false
num_layers: 2
time_horizon: 64
vis_encoder_type: simple
# Trainer configs common to PPO/SAC (excluding reward signals)
batch_size: 1024
buffer_size: 10240
hidden_units: 128
learning_rate: 3.0e-4
learning_rate_schedule: linear
max_steps: 5.0e5
normalize: false
num_layers: 2
time_horizon: 64
vis_encoder_type: simple
# PPO-specific configs
beta: 5.0e-3
epsilon: 0.2
lambd: 0.95
num_epoch: 3
threaded: true
# PPO-specific configs
beta: 5.0e-3
epsilon: 0.2
lambd: 0.95
num_epoch: 3
threaded: true
# memory
use_recurrent: true
sequence_length: 64
memory_size: 256
# memory
use_recurrent: true
sequence_length: 64
memory_size: 256
# behavior cloning
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
strength: 0.5
steps: 150000
batch_size: 512
num_epoch: 3
samples_per_update: 0
init_path:
# behavior cloning
behavioral_cloning:
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
strength: 0.5
steps: 150000
batch_size: 512
num_epoch: 3
samples_per_update: 0
init_path:
reward_signals:
# environment reward
extrinsic:
strength: 1.0
gamma: 0.99
reward_signals:
# environment reward
extrinsic:
strength: 1.0
gamma: 0.99
# curiosity module
curiosity:
strength: 0.02
gamma: 0.99
encoding_size: 256
learning_rate: 3e-4
# curiosity module
curiosity:
strength: 0.02
gamma: 0.99
encoding_size: 256
learning_rate: 3e-4
# GAIL
gail:
strength: 0.01
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
learning_rate: 3e-4
use_actions: false
use_vail: false
# GAIL
gail:
strength: 0.01
gamma: 0.99
encoding_size: 128
demo_path: Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
learning_rate: 3e-4
use_actions: false
use_vail: false
# self-play
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 50000
team_change: 100000
# self-play
self_play:
window: 10
play_against_latest_model_ratio: 0.5
save_steps: 50000
swap_steps: 50000
team_change: 100000
```
Here is an equivalent file if we use an SAC trainer instead. Notice that the

```yaml
BehaviorSAC:
trainer: sac
# Trainer configs common to PPO/SAC (excluding reward signals)
# same as PPO config
behaviors:
BehaviorSAC:
trainer: sac
# SAC-specific configs (replaces the "PPO-specific configs" section above)
buffer_init_steps: 0
tau: 0.005
steps_per_update: 1
train_interval: 1
init_entcoef: 1.0
save_replay_buffer: false
# Trainer configs common to PPO/SAC (excluding reward signals)
# same as PPO config
# memory
# same as PPO config
# SAC-specific configs (replaces the "PPO-specific configs" section above)
buffer_init_steps: 0
tau: 0.005
steps_per_update: 1
train_interval: 1
init_entcoef: 1.0
save_replay_buffer: false
# pre-training using behavior cloning
behavioral_cloning:
# memory
reward_signals:
reward_signal_num_update: 1 # only applies to SAC
# pre-training using behavior cloning
behavioral_cloning:
# same as PPO config
reward_signals:
reward_signal_num_update: 1 # only applies to SAC
# environment reward
extrinsic:
# same as PPO config
# environment reward
extrinsic:
# same as PPO config
# curiosity module
curiosity:
# same as PPO config
# curiosity module
curiosity:
# same as PPO config
# GAIL
gail:
# same as PPO config
# GAIL
gail:
# self-play
self_play:
# self-play
self_play:
# same as PPO config
```
We now break apart the components of the configuration file and describe what

### Curriculum Learning
To enable curriculum learning, you need to provide the `--curriculum` CLI option
and point to a YAML file that defines the curriculum. Here is one example file:
To enable curriculum learning, you need to add a sub-section to the corresponding
`behaviors` entry in the trainer config YAML file that defines the curriculum for that
behavior. Here is one example:
BehaviorY:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
wall_height: [1.5, 2.0, 2.5, 4.0]
behaviors:
BehaviorY:
# < Same as above >
# Add this section
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
wall_height: [1.5, 2.0, 2.5, 4.0]
```
Each group of Agents under the same `Behavior Name` in an environment can have a

In order to define the curricula, the first step is to decide which parameters
of the environment will vary. In the case of the Wall Jump environment, the
height of the wall is what varies. Rather than adjusting it by hand, we will
create a YAML file which describes the structure of the curricula. Within it, we
create a configuration which describes the structure of the curricula. Within it, we
can specify which points in the training process our wall height will change,
either based on the percentage of training steps which have taken place, or what
the average reward the agent has received in the recent past is. Below is an

BigWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]
behaviors:
BigWallJump:
# < Trainer parameters for BigWallJump >
# Curriculum configuration
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
big_wall_min_height: [0.0, 4.0, 6.0, 8.0]
big_wall_max_height: [4.0, 7.0, 8.0, 8.0]
SmallWallJump:
# < Trainer parameters for BigWallJump >
# Curriculum configuration
curriculum:
measure: progress
thresholds: [0.1, 0.3, 0.5]
min_lesson_length: 100
signal_smoothing: true
parameters:
small_wall_height: [1.5, 2.0, 2.5, 4.0]
```
The curriculum for each Behavior has the following parameters:

#### Training with a Curriculum
Once we have specified our metacurriculum and curricula, we can launch
`mlagents-learn` using the `--curriculum` flag to point to the config file for
`mlagents-learn` using the config file for
mlagents-learn config/trainer_config.yaml --curriculum=config/curricula/wall_jump.yaml --run-id=wall-jump-curriculum
mlagents-learn config/ppo/WallJump_curriculum.yaml --run-id=wall-jump-curriculum
```
We can then keep track of the current lessons and progresses via TensorBoard.

### Environment Parameter Randomization
To enable parameter randomization, you need to provide the `--sampler` CLI
option and point to a YAML file that defines the curriculum. Here is one example
file:
To enable parameter randomization, you need to add a `parameter_randomization` sub-section
to your trainer config YAML file. Here is one example:
resampling-interval: 5000
behaviors:
# < Same as above>
mass:
sampler-type: "uniform"
min_value: 0.5
max_value: 10
parameter_randomization:
resampling-interval: 5000
gravity:
sampler-type: "multirange_uniform"
intervals: [[7, 10], [15, 20]]
mass:
sampler-type: "uniform"
min_value: 0.5
max_value: 10
scale:
sampler-type: "uniform"
min_value: 0.75
max_value: 3
gravity:
sampler-type: "multirange_uniform"
intervals: [[7, 10], [15, 20]]
scale:
sampler-type: "uniform"
min_value: 0.75
max_value: 3
```
Note that `mass`, `gravity` and `scale` are the names of the environment

`interval_2_max`], ...]
- **sub-arguments** - `intervals`
The implementation of the samplers can be found at
`ml-agents-envs/mlagents_envs/sampler_class.py`.
The implementation of the samplers can be found in the
[sampler_class.py file](../ml-agents/mlagents/trainers/sampler_class.py).
#### Defining a New Sampler Type

#### Training with Environment Parameter Randomization
After the sampler YAML file is defined, we proceed by launching `mlagents-learn`
and specify our configured sampler file with the `--sampler` flag. For example,
After the sampler configuration is defined, we proceed by launching `mlagents-learn`
and specify the trainer configuration with `parameter_randomization` defined. For example,
`Environment Parameters` with `config/3dball_randomize.yaml` sampling setup, we
would run
`Environment Parameters` with sampling setup, we would run
mlagents-learn config/trainer_config.yaml --sampler=config/3dball_randomize.yaml
--run-id=3D-Ball-randomize
mlagents-learn config/ppo/3DBall_randomize.yaml --run-id=3D-Ball-randomize
```
We can observe progress and metrics via Tensorboard.

- **Buffer Size** - If you are having trouble getting an agent to train, even
with multiple concurrent Unity instances, you could increase `buffer_size` in
the `config/trainer_config.yaml` file. A common practice is to multiply
the trainer config file. A common practice is to multiply
`buffer_size` by `num-envs`.
- **Resource Constraints** - Invoking concurrent Unity instances is constrained
by the resources on the machine. Please use discretion when setting

docs/Using-Tensorboard.md (8 changes)


[TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard).
The `mlagents-learn` command saves training statistics to a folder named
`summaries`, organized by the `run-id` value you assign to a training session.
`results`, organized by the `run-id` value you assign to a training session.
In order to observe the training process, either during training or afterward,
start TensorBoard:

the --port option.
**Note:** If you don't assign a `run-id` identifier, `mlagents-learn` uses the
default string, "ppo". All the statistics will be saved to the same sub-folder
and displayed as one session in TensorBoard. After a few runs, the displays can
become difficult to interpret in this situation. You can delete the folders
under the `summaries` directory to clear out old statistics.
default string, "ppo". You can delete the folders under the `results` directory
to clear out old statistics.
On the left side of the TensorBoard window, you can select which of the training
runs you want to display. You can select multiple run-ids to compare statistics.

gym-unity/gym_unity/envs/__init__.py (21 changes)


self._env = unity_env
# Take a single step so that the brain information will be sent over
if not self._env.get_behavior_names():
if not self._env.behavior_specs:
self._n_agents = -1
# Save the step result from the last time all Agents requested decisions.
self._previous_decision_step: DecisionSteps = None

self._allow_multiple_visual_obs = allow_multiple_visual_obs
# Check brain configuration
if len(self._env.get_behavior_names()) != 1:
if len(self._env.behavior_specs) != 1:
self.name = self._env.get_behavior_names()[0]
self.group_spec = self._env.get_behavior_spec(self.name)
self.name = list(self._env.behavior_specs.keys())[0]
self.group_spec = self._env.behavior_specs[self.name]
if use_visual and self._get_n_vis_obs() == 0:
raise UnityGymException(

self._env.step()
decision_step, terminal_step = self._env.get_steps(self.name)
self._check_agents(max(len(decision_step), len(terminal_step)))
if len(terminal_step) != 0:
# The agent is done
self.game_over = True

logger.warning("Could not seed environment %s", self.name)
return
def _check_agents(self, n_agents: int) -> None:
if self._n_agents > 1:
@staticmethod
def _check_agents(n_agents: int) -> None:
if n_agents > 1:
"There can only be one Agent in the environment but {n_agents} were detected."
f"There can only be one Agent in the environment but {n_agents} were detected."
)
@property

@property
def observation_space(self):
return self._observation_space
@property
def number_agents(self):
return self._n_agents
class ActionFlattener:
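A minimal usage sketch of the updated wrapper, assuming a built single-Agent environment binary (named `Basic` here purely for illustration):
```python
from mlagents_envs.environment import UnityEnvironment
from gym_unity.envs import UnityToGymWrapper

# The wrapper now reads env.behavior_specs instead of the removed get_behavior_names().
unity_env = UnityEnvironment(file_name="Basic")
env = UnityToGymWrapper(unity_env, use_visual=False)

obs = env.reset()
done = False
while not done:
    # Random actions, just to drive one episode.
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()
```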

gym-unity/gym_unity/tests/test_gym.py (5 changes)


ActionType,
DecisionSteps,
TerminalSteps,
BehaviorMapping,
)

setup_mock_unityenvironment(
mock_env, mock_spec, mock_decision_step, mock_terminal_step
)
env = UnityToGymWrapper(mock_env, use_visual=False)
assert isinstance(env, UnityToGymWrapper)
assert isinstance(env.reset(), np.ndarray)

:Mock mock_decision: A DecisionSteps object that will be returned at each step and reset.
:Mock mock_termination: A TerminalSteps object that will be returned at each step and reset.
"""
mock_env.get_behavior_names.return_value = ["MockBrain"]
mock_env.get_behavior_spec.return_value = mock_spec
mock_env.behavior_specs = BehaviorMapping({"MockBrain": mock_spec})
mock_env.get_steps.return_value = (mock_decision, mock_termination)

ml-agents-envs/mlagents_envs/base_env.py (50 changes)


from abc import ABC, abstractmethod
from collections.abc import Mapping
from typing import List, NamedTuple, Tuple, Optional, Union, Dict, Iterator, Any
from typing import (
List,
NamedTuple,
Tuple,
Optional,
Union,
Dict,
Iterator,
Any,
Mapping as MappingType,
)
import numpy as np
from enum import Enum

return np.zeros((n_agents, self.action_size), dtype=np.float32)
class BehaviorMapping(Mapping):
def __init__(self, specs: Dict[BehaviorName, BehaviorSpec]):
self._dict = specs
def __len__(self) -> int:
return len(self._dict)
def __getitem__(self, behavior: BehaviorName) -> BehaviorSpec:
return self._dict[behavior]
def __iter__(self) -> Iterator[Any]:
yield from self._dict
class BaseEnv(ABC):
@abstractmethod
def step(self) -> None:

"""
pass
@abstractmethod
def reset(self) -> None:

pass
@abstractmethod
def close(self) -> None:

pass
@property
def get_behavior_names(self) -> List[BehaviorName]:
def behavior_specs(self) -> MappingType[str, BehaviorSpec]:
Returns the list of the behavior names present in the environment.
Returns a Mapping from behavior names to behavior specs.
This list can grow with time as new policies are instantiated.
:return: the list of agent BehaviorName.
Note that new keys can be added to this mapping as new policies are instantiated.
pass
@abstractmethod
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:

:param action: A two dimensional np.ndarray corresponding to the action
(either int or float)
"""
pass
@abstractmethod
def set_action_for_agent(

:param action: A one dimensional np.ndarray corresponding to the action
(either int or float)
"""
pass
@abstractmethod
def get_steps(

rewards, agent ids and interrupted flags of the agents that had their
episode terminated last step.
"""
pass
@abstractmethod
def get_behavior_spec(self, behavior_name: BehaviorName) -> BehaviorSpec:
"""
Get the BehaviorSpec corresponding to the behavior name
:param behavior_name: The name of the behavior the agents are part of
:return: A BehaviorSpec corresponding to that behavior
"""
pass
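A small sketch of how the new read-only `BehaviorMapping` behaves (the spec values are illustrative and assume the `BehaviorSpec` NamedTuple defined in this file):
```python
from mlagents_envs.base_env import ActionType, BehaviorMapping, BehaviorSpec

# Hand-built spec purely for illustration: one 3-float observation, 2 continuous actions.
spec = BehaviorSpec(
    observation_shapes=[(3,)], action_type=ActionType.CONTINUOUS, action_shape=2
)
specs = BehaviorMapping({"MyBehavior": spec})

assert len(specs) == 1                       # __len__
assert "MyBehavior" in specs                 # membership via __iter__
assert specs["MyBehavior"].action_size == 2  # __getitem__
# Mapping (not MutableMapping) defines no __setitem__, so the view stays read-only.
```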

ml-agents-envs/mlagents_envs/environment.py (293 changes)


import atexit
from distutils.version import StrictVersion
import glob
import uuid
from typing import Dict, List, Optional, Any, Tuple
from typing import Dict, List, Optional, Tuple, Mapping as MappingType
from mlagents_envs.side_channel.side_channel import SideChannel, IncomingMessage
from mlagents_envs.side_channel.side_channel import SideChannel
from mlagents_envs.side_channel.side_channel_manager import SideChannelManager
from mlagents_envs import env_utils
from mlagents_envs.base_env import (
BaseEnv,

BehaviorName,
AgentId,
BehaviorMapping,
)
from mlagents_envs.timers import timed, hierarchical_timer
from mlagents_envs.exception import (

from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto
from .rpc_communicator import RpcCommunicator
from sys import platform
import struct
SCALAR_ACTION_TYPES = (int, np.int32, np.int64, float, np.float32, np.float64)
SINGLE_BRAIN_ACTION_TYPES = SCALAR_ACTION_TYPES + (list, np.ndarray)
# Communication protocol version.
# When connecting to C#, this must be compatible with Academy.k_ApiVersion.
# We follow semantic versioning on the communication version, so existing

BASE_ENVIRONMENT_PORT = 5005
# Command line argument used to pass the port to the executable environment.
PORT_COMMAND_LINE_ARG = "--mlagents-port"
_PORT_COMMAND_LINE_ARG = "--mlagents-port"
@staticmethod
def _raise_version_exception(unity_com_ver: str) -> None:

)
@staticmethod
def check_communication_compatibility(
def _check_communication_compatibility(
unity_com_ver: str, python_api_version: str, unity_package_version: str
) -> bool:
unity_communicator_version = StrictVersion(unity_com_ver)

return True
@staticmethod
def get_capabilities_proto() -> UnityRLCapabilitiesProto:
def _get_capabilities_proto() -> UnityRLCapabilitiesProto:
def warn_csharp_base_capabitlities(
def _warn_csharp_base_capabilities(
caps: UnityRLCapabilitiesProto, unity_package_ver: str, python_package_ver: str
) -> None:
if not caps.baseRLCapabilities:

:str log_folder: Optional folder to write the Unity Player log file into. Requires absolute path.
"""
atexit.register(self._close)
self.additional_args = additional_args or []
self.no_graphics = no_graphics
self._additional_args = additional_args or []
self._no_graphics = no_graphics
# If base port is not specified, use BASE_ENVIRONMENT_PORT if we have
# an environment, otherwise DEFAULT_EDITOR_PORT
if base_port is None:

self.port = base_port + worker_id
self._port = base_port + worker_id
self.proc1 = None
self.timeout_wait: int = timeout_wait
self.communicator = self.get_communicator(worker_id, base_port, timeout_wait)
self.worker_id = worker_id
self.side_channels: Dict[uuid.UUID, SideChannel] = {}
if side_channels is not None:
for _sc in side_channels:
if _sc.channel_id in self.side_channels:
raise UnityEnvironmentException(
"There cannot be two side channels with the same channel id {0}.".format(
_sc.channel_id
)
)
self.side_channels[_sc.channel_id] = _sc
self.log_folder = log_folder
self._proc1 = None
self._timeout_wait: int = timeout_wait
self._communicator = self._get_communicator(worker_id, base_port, timeout_wait)
self._worker_id = worker_id
self._side_channel_manager = SideChannelManager(side_channels)
self._log_folder = log_folder
# If the environment name is None, a new environment will not be launched
# and the communicator will directly try to connect to an existing unity environment.

"the worker-id must be 0 in order to connect with the Editor."
)
if file_name is not None:
self.executable_launcher(file_name, no_graphics, additional_args)
try:
self._proc1 = env_utils.launch_executable(
file_name, self._executable_args()
)
except UnityEnvironmentException:
self._close(0)
raise
f"Listening on port {self.port}. "
f"Listening on port {self._port}. "
f"Start training by pressing the Play button in the Unity Editor."
)
self._loaded = True

communication_version=self.API_VERSION,
package_version=mlagents_envs.__version__,
capabilities=UnityEnvironment.get_capabilities_proto(),
capabilities=UnityEnvironment._get_capabilities_proto(),
aca_output = self.send_academy_parameters(rl_init_parameters_in)
aca_output = self._send_academy_parameters(rl_init_parameters_in)
if not UnityEnvironment.check_communication_compatibility(
if not UnityEnvironment._check_communication_compatibility(
aca_params.communication_version,
UnityEnvironment.API_VERSION,
aca_params.package_version,

UnityEnvironment.warn_csharp_base_capabitlities(
UnityEnvironment._warn_csharp_base_capabilities(
aca_params.capabilities,
aca_params.package_version,
UnityEnvironment.API_VERSION,

self._update_behavior_specs(aca_output)
@staticmethod
def get_communicator(worker_id, base_port, timeout_wait):
def _get_communicator(worker_id, base_port, timeout_wait):
@staticmethod
def validate_environment_path(env_path: str) -> Optional[str]:
# Strip out executable extensions if passed
env_path = (
env_path.strip()
.replace(".app", "")
.replace(".exe", "")
.replace(".x86_64", "")
.replace(".x86", "")
)
true_filename = os.path.basename(os.path.normpath(env_path))
logger.debug("The true file name is {}".format(true_filename))
if not (glob.glob(env_path) or glob.glob(env_path + ".*")):
return None
cwd = os.getcwd()
launch_string = None
true_filename = os.path.basename(os.path.normpath(env_path))
if platform == "linux" or platform == "linux2":
candidates = glob.glob(os.path.join(cwd, env_path) + ".x86_64")
if len(candidates) == 0:
candidates = glob.glob(os.path.join(cwd, env_path) + ".x86")
if len(candidates) == 0:
candidates = glob.glob(env_path + ".x86_64")
if len(candidates) == 0:
candidates = glob.glob(env_path + ".x86")
if len(candidates) > 0:
launch_string = candidates[0]
elif platform == "darwin":
candidates = glob.glob(
os.path.join(cwd, env_path + ".app", "Contents", "MacOS", true_filename)
)
if len(candidates) == 0:
candidates = glob.glob(
os.path.join(env_path + ".app", "Contents", "MacOS", true_filename)
)
if len(candidates) == 0:
candidates = glob.glob(
os.path.join(cwd, env_path + ".app", "Contents", "MacOS", "*")
)
if len(candidates) == 0:
candidates = glob.glob(
os.path.join(env_path + ".app", "Contents", "MacOS", "*")
)
if len(candidates) > 0:
launch_string = candidates[0]
elif platform == "win32":
candidates = glob.glob(os.path.join(cwd, env_path + ".exe"))
if len(candidates) == 0:
candidates = glob.glob(env_path + ".exe")
if len(candidates) > 0:
launch_string = candidates[0]
return launch_string
def executable_args(self) -> List[str]:
def _executable_args(self) -> List[str]:
if self.no_graphics:
if self._no_graphics:
args += [UnityEnvironment.PORT_COMMAND_LINE_ARG, str(self.port)]
if self.log_folder:
args += [UnityEnvironment._PORT_COMMAND_LINE_ARG, str(self._port)]
if self._log_folder:
self.log_folder, f"Player-{self.worker_id}.log"
self._log_folder, f"Player-{self._worker_id}.log"
args += self.additional_args
args += self._additional_args
def executable_launcher(self, file_name, no_graphics, args):
launch_string = self.validate_environment_path(file_name)
if launch_string is None:
self._close(0)
raise UnityEnvironmentException(
f"Couldn't launch the {file_name} environment. Provided filename does not match any environments."
)
else:
logger.debug("This is the launch string {}".format(launch_string))
# Launch Unity environment
subprocess_args = [launch_string] + self.executable_args()
try:
self.proc1 = subprocess.Popen(
subprocess_args,
# start_new_session=True means that signals to the parent python process
# (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms.
# This is generally good since we want the environment to have a chance to shutdown,
# but may be undesirable in some cases; if so, we'll add a command-line toggle.
# Note that on Windows, the CTRL_C signal will still be sent.
start_new_session=True,
)
except PermissionError as perm:
# This is likely due to missing read or execute permissions on file.
raise UnityEnvironmentException(
f"Error when trying to launch environment - make sure "
f"permissions are set correctly. For example "
f'"chmod -R 755 {launch_string}"'
) from perm
def _update_behavior_specs(self, output: UnityOutputProto) -> None:
init_output = output.rl_initialization_output
for brain_param in init_output.brain_parameters:

DecisionSteps.empty(self._env_specs[brain_name]),
TerminalSteps.empty(self._env_specs[brain_name]),
)
self._parse_side_channel_message(self.side_channels, output.side_channel)
self._side_channel_manager.process_side_channel_message(output.side_channel)
outputs = self.communicator.exchange(self._generate_reset_input())
outputs = self._communicator.exchange(self._generate_reset_input())
if outputs is None:
raise UnityCommunicatorStoppedException("Communicator has exited.")
self._update_behavior_specs(outputs)

].create_empty_action(n_agents)
step_input = self._generate_step_input(self._env_actions)
with hierarchical_timer("communicator.exchange"):
outputs = self.communicator.exchange(step_input)
outputs = self._communicator.exchange(step_input)
if outputs is None:
raise UnityCommunicatorStoppedException("Communicator has exited.")
self._update_behavior_specs(outputs)

def get_behavior_names(self):
return list(self._env_specs.keys())
@property
def behavior_specs(self) -> MappingType[str, BehaviorSpec]:
return BehaviorMapping(self._env_specs)
def _assert_behavior_exists(self, behavior_name: str) -> None:
if behavior_name not in self._env_specs:

expected_shape = (len(self._env_state[behavior_name][0]), spec.action_size)
if action.shape != expected_shape:
raise UnityActionException(
"The behavior {0} needs an input of dimension {1} but received input of dimension {2}".format(
behavior_name, expected_shape, action.shape
)
"The behavior {0} needs an input of dimension {1} for "
"(<number of agents>, <action size>) but received input of "
"dimension {2}".format(behavior_name, expected_shape, action.shape)
)
if action.dtype != expected_type:
action = action.astype(expected_type)

self._assert_behavior_exists(behavior_name)
return self._env_state[behavior_name]
def get_behavior_spec(self, behavior_name: BehaviorName) -> BehaviorSpec:
self._assert_behavior_exists(behavior_name)
return self._env_specs[behavior_name]
def close(self):
"""
Sends a shutdown signal to the unity environment, and closes the socket connection.

force-killing it. Defaults to `self.timeout_wait`.
"""
if timeout is None:
timeout = self.timeout_wait
timeout = self._timeout_wait
self.communicator.close()
if self.proc1 is not None:
self._communicator.close()
if self._proc1 is not None:
self.proc1.wait(timeout=timeout)
signal_name = self.returncode_to_signal_name(self.proc1.returncode)
self._proc1.wait(timeout=timeout)
signal_name = self._returncode_to_signal_name(self._proc1.returncode)
return_info = f"Environment shut down with return code {self.proc1.returncode}{signal_name}."
return_info = f"Environment shut down with return code {self._proc1.returncode}{signal_name}."
self.proc1.kill()
self._proc1.kill()
self.proc1 = None
@classmethod
def _flatten(cls, arr: Any) -> List[float]:
"""
Converts arrays to list.
:param arr: numpy vector.
:return: flattened list.
"""
if isinstance(arr, cls.SCALAR_ACTION_TYPES):
arr = [float(arr)]
if isinstance(arr, np.ndarray):
arr = arr.tolist()
if len(arr) == 0:
return arr
if isinstance(arr[0], np.ndarray):
# pylint: disable=no-member
arr = [item for sublist in arr for item in sublist.tolist()]
if isinstance(arr[0], list):
# pylint: disable=not-an-iterable
arr = [item for sublist in arr for item in sublist]
arr = [float(x) for x in arr]
return arr
@staticmethod
def _parse_side_channel_message(
side_channels: Dict[uuid.UUID, SideChannel], data: bytes
) -> None:
offset = 0
while offset < len(data):
try:
channel_id = uuid.UUID(bytes_le=bytes(data[offset : offset + 16]))
offset += 16
message_len, = struct.unpack_from("<i", data, offset)
offset = offset + 4
message_data = data[offset : offset + message_len]
offset = offset + message_len
except Exception:
raise UnityEnvironmentException(
"There was a problem reading a message in a SideChannel. "
"Please make sure the version of MLAgents in Unity is "
"compatible with the Python version."
)
if len(message_data) != message_len:
raise UnityEnvironmentException(
"The message received by the side channel {0} was "
"unexpectedly short. Make sure your Unity Environment "
"sending side channel data properly.".format(channel_id)
)
if channel_id in side_channels:
incoming_message = IncomingMessage(message_data)
side_channels[channel_id].on_message_received(incoming_message)
else:
logger.warning(
"Unknown side channel data received. Channel type "
": {0}.".format(channel_id)
)
@staticmethod
def _generate_side_channel_data(
side_channels: Dict[uuid.UUID, SideChannel]
) -> bytearray:
result = bytearray()
for channel_id, channel in side_channels.items():
for message in channel.message_queue:
result += channel_id.bytes_le
result += struct.pack("<i", len(message))
result += message
channel.message_queue = []
return result
self._proc1 = None
@timed
def _generate_step_input(

action = AgentActionProto(vector_actions=vector_action[b][i])
rl_in.agent_actions[b].value.extend([action])
rl_in.command = STEP
rl_in.side_channel = bytes(self._generate_side_channel_data(self.side_channels))
return self.wrap_unity_input(rl_in)
rl_in.side_channel = bytes(
self._side_channel_manager.generate_side_channel_messages()
)
return self._wrap_unity_input(rl_in)
rl_in.side_channel = bytes(self._generate_side_channel_data(self.side_channels))
return self.wrap_unity_input(rl_in)
rl_in.side_channel = bytes(
self._side_channel_manager.generate_side_channel_messages()
)
return self._wrap_unity_input(rl_in)
def send_academy_parameters(
def _send_academy_parameters(
return self.communicator.initialize(inputs)
return self._communicator.initialize(inputs)
def wrap_unity_input(rl_input: UnityRLInputProto) -> UnityInputProto:
def _wrap_unity_input(rl_input: UnityRLInputProto) -> UnityInputProto:
def returncode_to_signal_name(returncode: int) -> Optional[str]:
def _returncode_to_signal_name(returncode: int) -> Optional[str]:
"""
Try to convert return codes into their corresponding signal name.
E.g. returncode_to_signal_name(-2) -> "SIGINT"

ml-agents-envs/mlagents_envs/tests/test_envs.py (54 changes)


from mlagents_envs.mock_communicator import MockCommunicator
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
@mock.patch("mlagents_envs.environment.UnityEnvironment._get_communicator")
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
@mock.patch("mlagents_envs.env_utils.launch_executable")
@mock.patch("mlagents_envs.environment.UnityEnvironment._get_communicator")
assert env.get_behavior_names() == ["RealFakeBrain"]
assert list(env.behavior_specs.keys()) == ["RealFakeBrain"]
env.close()

(None, None, UnityEnvironment.DEFAULT_EDITOR_PORT),
],
)
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
@mock.patch("mlagents_envs.env_utils.launch_executable")
@mock.patch("mlagents_envs.environment.UnityEnvironment._get_communicator")
def test_port_defaults(
mock_communicator, mock_launcher, base_port, file_name, expected
):

env = UnityEnvironment(file_name=file_name, worker_id=0, base_port=base_port)
assert expected == env.port
assert expected == env._port
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
@mock.patch("mlagents_envs.env_utils.launch_executable")
@mock.patch("mlagents_envs.environment.UnityEnvironment._get_communicator")
args = env.executable_args()
args = env._executable_args()
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
@mock.patch("mlagents_envs.env_utils.launch_executable")
@mock.patch("mlagents_envs.environment.UnityEnvironment._get_communicator")
spec = env.get_behavior_spec("RealFakeBrain")
spec = env.behavior_specs["RealFakeBrain"]
env.reset()
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
env.close()

assert (n_agents,) + shape == obs.shape
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
@mock.patch("mlagents_envs.env_utils.launch_executable")
@mock.patch("mlagents_envs.environment.UnityEnvironment._get_communicator")
spec = env.get_behavior_spec("RealFakeBrain")
spec = env.behavior_specs["RealFakeBrain"]
env.step()
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)

assert 2 in terminal_steps
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
@mock.patch("mlagents_envs.env_utils.launch_executable")
@mock.patch("mlagents_envs.environment.UnityEnvironment._get_communicator")
def test_close(mock_communicator, mock_launcher):
comm = MockCommunicator(discrete_action=False, visual_inputs=0)
mock_communicator.return_value = comm

unity_ver = "1.0.0"
python_ver = "1.0.0"
unity_package_version = "0.15.0"
assert UnityEnvironment.check_communication_compatibility(
assert UnityEnvironment._check_communication_compatibility(
assert UnityEnvironment.check_communication_compatibility(
assert UnityEnvironment._check_communication_compatibility(
assert not UnityEnvironment.check_communication_compatibility(
assert not UnityEnvironment._check_communication_compatibility(
assert UnityEnvironment.check_communication_compatibility(
assert UnityEnvironment._check_communication_compatibility(
assert not UnityEnvironment.check_communication_compatibility(
assert not UnityEnvironment._check_communication_compatibility(
assert not UnityEnvironment.check_communication_compatibility(
assert not UnityEnvironment._check_communication_compatibility(
assert UnityEnvironment.returncode_to_signal_name(-2) == "SIGINT"
assert UnityEnvironment.returncode_to_signal_name(42) is None
assert UnityEnvironment.returncode_to_signal_name("SIGINT") is None
assert UnityEnvironment._returncode_to_signal_name(-2) == "SIGINT"
assert UnityEnvironment._returncode_to_signal_name(42) is None
assert UnityEnvironment._returncode_to_signal_name("SIGINT") is None
if __name__ == "__main__":

118
ml-agents-envs/mlagents_envs/tests/test_side_channel.py


import uuid
import pytest
from mlagents_envs.side_channel.side_channel_manager import SideChannelManager
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import (
EngineConfigurationChannel,
EngineConfig,
)
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)
from mlagents_envs.side_channel.stats_side_channel import (
StatsSideChannel,
StatsAggregationMethod,
)
from mlagents_envs.exception import (
UnitySideChannelException,
UnityCommunicationException,
)
class IntChannel(SideChannel):

receiver = IntChannel()
sender.send_int(5)
sender.send_int(6)
data = UnityEnvironment._generate_side_channel_data({sender.channel_id: sender})
UnityEnvironment._parse_side_channel_message({receiver.channel_id: receiver}, data)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
assert receiver.list_int[0] == 5
assert receiver.list_int[1] == 6

sender.set_property("prop1", 1.0)
data = UnityEnvironment._generate_side_channel_data({sender.channel_id: sender})
UnityEnvironment._parse_side_channel_message({receiver.channel_id: receiver}, data)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
val = receiver.get_property("prop1")
assert val == 1.0

data = UnityEnvironment._generate_side_channel_data({sender.channel_id: sender})
UnityEnvironment._parse_side_channel_message({receiver.channel_id: receiver}, data)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
val = receiver.get_property("prop1")
assert val == 1.0

sender.send_raw_data("foo".encode("ascii"))
sender.send_raw_data("bar".encode("ascii"))
data = UnityEnvironment._generate_side_channel_data({sender.channel_id: sender})
UnityEnvironment._parse_side_channel_message({receiver.channel_id: receiver}, data)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
messages = receiver.get_and_clear_received_messages()
assert len(messages) == 2

# Test reading with defaults
assert [] == msg_in.read_float32_list()
assert val == msg_in.read_float32_list(default_value=val)
def test_engine_configuration():
sender = EngineConfigurationChannel()
    # We use a raw bytes channel to interpret the data
receiver = RawBytesChannel(sender.channel_id)
config = EngineConfig.default_config()
sender.set_configuration(config)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
received_data = receiver.get_and_clear_received_messages()
assert len(received_data) == 5 # 5 different messages one for each setting
sent_time_scale = 4.5
sender.set_configuration_parameters(time_scale=sent_time_scale)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
message = IncomingMessage(receiver.get_and_clear_received_messages()[0])
message.read_int32()
time_scale = message.read_float32()
assert time_scale == sent_time_scale
with pytest.raises(UnitySideChannelException):
sender.set_configuration_parameters(width=None, height=42)
with pytest.raises(UnityCommunicationException):
# try to send data to the EngineConfigurationChannel
sender.set_configuration_parameters(time_scale=sent_time_scale)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([sender]).process_side_channel_message(data)
def test_environment_parameters():
sender = EnvironmentParametersChannel()
    # We use a raw bytes channel to interpret the data
receiver = RawBytesChannel(sender.channel_id)
sender.set_float_parameter("param-1", 0.1)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
message = IncomingMessage(receiver.get_and_clear_received_messages()[0])
key = message.read_string()
dtype = message.read_int32()
value = message.read_float32()
assert key == "param-1"
assert dtype == EnvironmentParametersChannel.EnvironmentDataTypes.FLOAT
assert value - 0.1 < 1e-8
sender.set_float_parameter("param-1", 0.1)
sender.set_float_parameter("param-2", 0.1)
sender.set_float_parameter("param-3", 0.1)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([receiver]).process_side_channel_message(data)
assert len(receiver.get_and_clear_received_messages()) == 3
with pytest.raises(UnityCommunicationException):
# try to send data to the EngineConfigurationChannel
sender.set_float_parameter("param-1", 0.1)
data = SideChannelManager([sender]).generate_side_channel_messages()
SideChannelManager([sender]).process_side_channel_message(data)
def test_stats_channel():
receiver = StatsSideChannel()
message = OutgoingMessage()
message.write_string("stats-1")
message.write_float32(42.0)
message.write_int32(1) # corresponds to StatsAggregationMethod.MOST_RECENT
receiver.on_message_received(IncomingMessage(message.buffer))
stats = receiver.get_and_reset_stats()
assert len(stats) == 1
val, method = stats["stats-1"]
assert val - 42.0 < 1e-8
assert method == StatsAggregationMethod.MOST_RECENT

14
ml-agents/mlagents/trainers/learn.py


from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents_envs.side_channel.side_channel import SideChannel
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig
from mlagents_envs.exception import UnityEnvironmentException
from mlagents_envs.timers import (
hierarchical_timer,
get_timer_tree,

os.path.join(base_path, options.run_id) if options.initialize_from else None
)
run_logs_dir = os.path.join(write_path, "run_logs")
port = options.base_port
port: Optional[int] = options.base_port
# Check if directory exists
handle_existing_directories(
write_path, options.resume, options.force, maybe_init_path

StatsReporter.add_writer(console_writer)
if options.env_path is None:
port = UnityEnvironment.DEFAULT_EDITOR_PORT
port = None
env_factory = create_environment_factory(
options.env_path,
options.no_graphics,

env_path: Optional[str],
no_graphics: bool,
seed: int,
start_port: int,
start_port: Optional[int],
if env_path is not None:
launch_string = UnityEnvironment.validate_environment_path(env_path)
if launch_string is None:
raise UnityEnvironmentException(
f"Couldn't launch the {env_path} environment. Provided filename does not match any environments."
)
def create_unity_environment(
worker_id: int, side_channels: List[SideChannel]
) -> UnityEnvironment:

25
ml-agents/mlagents/trainers/models.py


RESNET = "resnet"
class LearningRateSchedule(Enum):
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"

return global_step, increment_step, steps_to_increment
@staticmethod
def create_learning_rate(
lr_schedule: LearningRateSchedule,
lr: float,
def create_schedule(
schedule: ScheduleType,
parameter: float,
min_value: float,
) -> tf.Tensor:
"""
Create a learning rate tensor.

:param max_step: The maximum number of steps in the training run.
:return: A Tensor containing the learning rate.
"""
if lr_schedule == LearningRateSchedule.CONSTANT:
learning_rate = tf.Variable(lr)
elif lr_schedule == LearningRateSchedule.LINEAR:
learning_rate = tf.train.polynomial_decay(
lr, global_step, max_step, 1e-10, power=1.0
if schedule == ScheduleType.CONSTANT:
parameter_rate = tf.Variable(parameter, trainable=False)
elif schedule == ScheduleType.LINEAR:
parameter_rate = tf.train.polynomial_decay(
parameter, global_step, max_step, min_value, power=1.0
raise UnityTrainerException(
"The learning rate schedule {} is invalid.".format(lr_schedule)
)
return learning_rate
raise UnityTrainerException("The schedule {} is invalid.".format(schedule))
return parameter_rate
@staticmethod
def scaled_init(scale):
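For intuition on the new `create_schedule`, the `ScheduleType.LINEAR` branch anneals the parameter toward `min_value` over `max_step` steps via `tf.train.polynomial_decay` with `power=1.0`. A framework-free sketch of that formula (the helper function and the numbers below are illustrative, not part of the codebase):

```python
def linear_schedule(parameter: float, step: int, max_step: int, min_value: float) -> float:
    # Mirrors tf.train.polynomial_decay(..., power=1.0): linear anneal down to min_value.
    frac = min(step, max_step) / max_step
    return (parameter - min_value) * (1.0 - frac) + min_value

# e.g. a learning rate of 3e-4 halfway through a 100k-step run decays to ~1.5e-4
print(linear_schedule(3e-4, 50_000, 100_000, 1e-10))
```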

1
ml-agents/mlagents/trainers/policy/tf_policy.py


)
)
if reset_global_steps:
self._set_step(0)
logger.info(
"Starting training from step 0 and saving to {}.".format(
self.model_path

34
ml-agents/mlagents/trainers/ppo/optimizer.py


import numpy as np
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType, LearningRateSchedule
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer

super().__init__(policy, trainer_params)
lr = float(trainer_params["learning_rate"])
lr_schedule = LearningRateSchedule(
self._schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "linear")
)
h_size = int(trainer_params["hidden_units"])

"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
if self.policy.use_recurrent:
self.m_size = self.policy.m_size

else:
self._create_dc_critic(h_size, num_layers, vis_encode_type)
self.learning_rate = ModelUtils.create_learning_rate(
lr_schedule, lr, self.policy.global_step, int(max_step)
self.learning_rate = ModelUtils.create_schedule(
self._schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy.total_log_probs,

"policy_loss": self.abs_policy_loss,
"update_batch": self.update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)

)
advantage = tf.expand_dims(self.advantage, -1)
decay_epsilon = tf.train.polynomial_decay(
epsilon, self.policy.global_step, max_step, 0.1, power=1.0
self.decay_epsilon = ModelUtils.create_schedule(
self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
decay_beta = tf.train.polynomial_decay(
beta, self.policy.global_step, max_step, 1e-5, power=1.0
self.decay_beta = ModelUtils.create_schedule(
self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
)
value_losses = []

-decay_epsilon,
decay_epsilon,
-self.decay_epsilon,
self.decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)

r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon)
tf.clip_by_value(
r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
)
* advantage
)
self.policy_loss = -tf.reduce_mean(

self.loss = (
self.policy_loss
+ 0.5 * self.value_loss
- decay_beta
- self.decay_beta
* tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
)

4
ml-agents/mlagents/trainers/ppo/trainer.py


self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for l in range(0, max_num_batch * batch_size, batch_size):
for i in range(0, max_num_batch * batch_size, batch_size):
buffer.make_mini_batch(l, l + batch_size), n_sequences
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)

12
ml-agents/mlagents/trainers/sac/optimizer.py


from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import LearningRateSchedule, EncoderType, ModelUtils
from mlagents.trainers.models import ScheduleType, EncoderType, ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer

with tf.variable_scope(""):
super().__init__(policy, trainer_params)
lr = float(trainer_params["learning_rate"])
lr_schedule = LearningRateSchedule(
lr_schedule = ScheduleType(
trainer_params.get("learning_rate_schedule", "constant")
)
self.policy = policy

# The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
self.m_size = 3 * self.policy.m_size
self._create_inputs_and_outputs()
self.learning_rate = ModelUtils.create_learning_rate(
lr_schedule, lr, self.policy.global_step, int(max_step)
self.learning_rate = ModelUtils.create_schedule(
lr_schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy_network.q1_heads,

8
ml-agents/mlagents/trainers/simple_env_manager.py


@property
def external_brains(self) -> Dict[BehaviorName, BrainParameters]:
result = {}
for brain_name in self.env.get_behavior_names():
result[brain_name] = behavior_spec_to_brain_parameters(
brain_name, self.env.get_behavior_spec(brain_name)
for behavior_name, behavior_spec in self.env.behavior_specs.items():
result[behavior_name] = behavior_spec_to_brain_parameters(
behavior_name, behavior_spec
)
return result

def _generate_all_results(self) -> AllStepResult:
all_step_result: AllStepResult = {}
for brain_name in self.env.get_behavior_names():
for brain_name in self.env.behavior_specs:
all_step_result[brain_name] = self.env.get_steps(brain_name)
return all_step_result

10
ml-agents/mlagents/trainers/subprocess_env_manager.py


def _generate_all_results() -> AllStepResult:
all_step_result: AllStepResult = {}
for brain_name in env.get_behavior_names():
for brain_name in env.behavior_specs:
for brain_name in env.get_behavior_names():
result[brain_name] = behavior_spec_to_brain_parameters(
brain_name, env.get_behavior_spec(brain_name)
for behavior_name, behavior_specs in env.behavior_specs.items():
result[behavior_name] = behavior_spec_to_brain_parameters(
behavior_name, behavior_specs
)
return result

return self.env_workers[0].recv().payload
def close(self) -> None:
logger.debug(f"SubprocessEnvManager closing.")
logger.debug("SubprocessEnvManager closing.")
self.step_queue.close()
self.step_queue.join_thread()
for env_worker in self.env_workers:

12
ml-agents/mlagents/trainers/tests/simple_test_envs.py


DecisionSteps,
TerminalSteps,
ActionType,
BehaviorMapping,
)
from mlagents_envs.tests.test_rpc_utils import proto_from_steps_and_action
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (

obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
return obs
def get_behavior_names(self):
return self.names
def get_behavior_spec(self, behavior_name):
return self.behavior_spec
@property
def behavior_specs(self):
behavior_dict = {}
for n in self.names:
behavior_dict[n] = self.behavior_spec
return BehaviorMapping(behavior_dict)
def set_action_for_agent(self, behavior_name, agent_id, action):
pass

5
ml-agents/mlagents/trainers/tests/test_learn.py


def test_bad_env_path():
with pytest.raises(UnityEnvironmentException):
learn.create_environment_factory(
factory = learn.create_environment_factory(
seed=None,
seed=-1,
factory(worker_id=-1, side_channels=[])
@patch("builtins.open", new_callable=mock_open, read_data=MOCK_YAML)

4
ml-agents/mlagents/trainers/tests/test_nn_policy.py


trainer_params["output_path"] = path1
policy = create_policy_mock(trainer_params)
policy.initialize_or_load()
policy._set_step(2000)
policy.save_model(2000)
assert len(os.listdir(tmp_path)) > 0

policy2.initialize_or_load()
_compare_two_policies(policy, policy2)
assert policy2.get_current_step() == 2000
# Try initialize from path 1
trainer_params["model_path"] = path2

_compare_two_policies(policy2, policy3)
# Assert that the steps are 0.
assert policy3.get_current_step() == 0
def _compare_two_policies(policy1: NNPolicy, policy2: NNPolicy) -> None:

4
ml-agents/mlagents/trainers/tests/test_simple_rl.py


step_size=0.2,
)
override_vals = {
"max_steps": 750,
"max_steps": 1000,
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
"behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1500},
"reward_signals": {
"gail": {
"strength": 1.0,

6
ml-agents/tests/yamato/check_coverage_percent.py


# Rather than try to parse the XML, just look for a line of the form
# <Linecoverage>73.9</Linecoverage>
lines = f.readlines()
for l in lines:
if "Linecoverage" in l:
pct = l.replace("<Linecoverage>", "").replace("</Linecoverage>", "")
for line in lines:
if "Linecoverage" in line:
pct = line.replace("<Linecoverage>", "").replace("</Linecoverage>", "")
pct = float(pct)
if pct < min_percentage:
print(

4
ml-agents/tests/yamato/scripts/run_llapi.py


env.reset()
# Set the default brain to work with
group_name = env.get_behavior_names()[0]
group_spec = env.get_behavior_spec(group_name)
group_name = list(env.behavior_specs.keys())[0]
group_spec = env.behavior_specs[group_name]
# Set the time scale of the engine
engine_configuration_channel.set_configuration_parameters(time_scale=3.0)

21
ml-agents/tests/yamato/training_int_tests.py


run_standalone_build,
init_venv,
override_config_file,
override_legacy_config_file,
checkout_csharp_version,
undo_git_checkout,
)

print(
f"Running training with python={python_version or latest} and c#={csharp_version or latest}"
)
nn_file_expected = f"./results/{run_id}/3DBall.nn"
output_dir = "models" if python_version else "results"
nn_file_expected = f"./{output_dir}/{run_id}/3DBall.nn"
if os.path.exists(nn_file_expected):
# Should never happen - make sure nothing leftover from an old test.
print("Artifacts from previous build found!")

# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.
override_config_file(
"config/ppo/3DBall.yaml",
"override.yaml",
max_steps=100,
batch_size=10,
buffer_size=10,
)
overrides = {"max_steps": 100, "batch_size": 10, "buffer_size": 10}
yaml_out = "override.yaml"
if python_version:
override_legacy_config_file(
python_version, "config/trainer_config.yaml", yaml_out, **overrides
)
else:
override_config_file("config/ppo/3DBall.yaml", yaml_out, **overrides)
f"mlagents-learn override.yaml --train --env="
f"mlagents-learn {yaml_out} --force --env="
f"{os.path.join(get_base_output_path(), standalone_player_path)} "
f"--run-id={run_id} --no-graphics --env-args -logFile -"
) # noqa

28
ml-agents/tests/yamato/yamato_utils.py


if csharp_version is None:
return
csharp_tag = f"com.unity.ml-agents_{csharp_version}"
subprocess.check_call(
f"git checkout {csharp_version} -- {csharp_dir}", shell=True
)
subprocess.check_call(f"git checkout {csharp_tag} -- {csharp_dir}", shell=True)
def undo_git_checkout():

subprocess.check_call("git reset HEAD .", shell=True)
subprocess.check_call("git checkout -- .", shell=True)
# Ensure the cache isn't polluted with old compiled assemblies.
subprocess.check_call(f"rm -rf Project/Library", shell=True)
subprocess.check_call("rm -rf Project/Library", shell=True)
def override_config_file(src_path, dest_path, **kwargs):

with open(dest_path, "w") as f:
yaml.dump(configs, f)
def override_legacy_config_file(python_version, src_path, dest_path, **kwargs):
"""
Override settings in a trainer config file, using an old version of the src_path. For example,
override_config_file("0.16.0", src_path, dest_path, max_steps=42)
will sync the file at src_path from version 0.16.0, copy it to dest_path, and override the
max_steps field to 42 for all brains.
"""
# Sync the old version of the file
python_tag = f"python-packages_{python_version}"
subprocess.check_call(f"git checkout {python_tag} -- {src_path}", shell=True)
with open(src_path) as f:
configs = yaml.safe_load(f)
for config in configs.values():
config.update(**kwargs)
with open(dest_path, "w") as f:
yaml.dump(configs, f)

6
utils/validate_versions.py


def extract_version_string(filename):
with open(filename) as f:
for l in f.readlines():
if l.startswith(VERSION_LINE_START):
return l.replace(VERSION_LINE_START, "").strip()
for line in f.readlines():
if line.startswith(VERSION_LINE_START):
return line.replace(VERSION_LINE_START, "").strip()
return None

37
com.unity.ml-agents/Tests/Editor/Communicator/GrpcExtensionsTests.cs


using NUnit.Framework;
using UnityEngine;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Demonstrations;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Tests
{
[TestFixture]
public class GrpcExtensionsTests
{
[Test]
public void TestDefaultBrainParametersToProto()
{
// Should be able to convert a default instance to proto.
var brain = new BrainParameters();
brain.ToProto("foo", false);
}
[Test]
public void TestDefaultAgentInfoToProto()
{
// Should be able to convert a default instance to proto.
var agentInfo = new AgentInfo();
agentInfo.ToInfoActionPairProto();
agentInfo.ToAgentInfoProto();
}
[Test]
public void TestDefaultDemonstrationMetaDataToProto()
{
// Should be able to convert a default instance to proto.
var demoMetaData = new DemonstrationMetaData();
demoMetaData.ToProto();
}
}
}

95
docs/Versioning.md


# ML-Agents Versioning
## Context
As the ML-Agents project evolves into a more mature product, we want to clearly communicate the
process we use to version our packages and the data that flows into, through, and out of them.
Our project now has four packages (1 Unity, 3 Python) along with artifacts that are produced as
well as consumed. This document covers the versioning for these packages and artifacts.
## GitHub Releases
Up until now, all packages were in lockstep in terms of versioning. As a result, the GitHub releases
were tagged with the version of all those packages (e.g. v0.15.0, v0.15.1) and labeled accordingly.
With the decoupling of package versions, we now need to revisit our GitHub release tagging.
The proposal is that we move towards an integer release numbering for our repo and each such
release will call out specific version upgrades of each package. For instance, with
[the April 30th release](https://github.com/Unity-Technologies/ml-agents/releases/tag/release_1),
we will have:
- GitHub Release 1 (branch name: *release_1_branch*)
- com.unity.ml-agents release 1.0.0
- ml-agents release 0.16.0
- ml-agents-envs release 0.16.0
- gym-unity release 0.16.0
Our release cadence will not be affected by these versioning changes. We will keep having
monthly releases to fix bugs and release new features.
## Packages
All of the software packages and their generated artifacts will be versioned; any automation
tools will not be versioned.
### Unity package
Package name: com.unity.ml-agents
- Versioned following [Semantic Versioning Guidelines](https://www.semver.org)
- This package consumes an artifact of the training process: the `.nn` file. These files
are integer versioned and currently at version 2. The com.unity.ml-agents package
will need to support the version of `.nn` files which existed at its 1.0.0 release.
For example, consider that com.unity.ml-agents is at version 1.0.0 and the NN files
are at version 2. If the NN files change to version 3, the next release of
com.unity.ml-agents at version 1.1.0 guarantees it will be able to read both of these
formats. If the NN files were to change to version 4 and com.unity.ml-agents to
version 2.0.0, support for NN versions 2 and 3 could be dropped for com.unity.ml-agents
version 2.0.0.
- This package produces one kind of artifact, the `.demo` files. These files will have integer
versioning. This means their version will increment by 1 at each change. The
com.unity.ml-agents package must be backward compatible with version changes
that occur between minor versions.
- To summarize, the artifacts produced and consumed by com.unity.ml-agents are guaranteed
to be supported for 1.x.x versions of com.unity.ml-agents. We intend to provide stability
for our users by moving to a 1.0.0 release of com.unity.ml-agents.
### Python Packages
Package names: ml-agents / ml-agents-envs / gym-unity
- The python packages remain in "Beta." This means that breaking changes to the public
API of the python packages can occur without a major version bump.
Historically, the python and C# packages were in version lockstep. This is no longer
the case. The python packages will remain in lockstep with each other for now, while the
C# package will follow its own versioning as is appropriate. However, the python package
versions may diverge in the future.
- While the python packages will remain in Beta for now, we acknowledge that the most
heavily used portion of our python interface is the `mlagents-learn` CLI and strive
to make this part of our API backward compatible. We are actively working on this and
expect to have a stable CLI in the next few weeks.
## Communicator
Packages which communicate: com.unity.ml-agents / ml-agents-envs
Another entity of the ML-Agents Toolkit that requires versioning is the communication layer
between C# and Python, which will also follow semantic versioning. This guarantees a level of
backward compatibility between different versions of C# and Python packages which communicate.
Any Communicator version 1.x.x of the Unity package should be compatible with any 1.x.x
Communicator version in Python.
An RLCapabilities struct keeps track of which features exist. This struct is passed from C# to
Python, and another from Python to C#. With this feature-level granularity, we can notify users
more specifically about feature limitations based on what's available in both C# and Python.
These notifications will be logged to the python terminal or to the Unity Editor Console.
## Side Channels
The communicator is what manages data transfer between Unity and Python for the core
training loop. Side Channels are another means of data transfer between Unity and Python.
Side Channels are not versioned, but have been designed with backward compatibility in mind.
As of today, we provide 4 side channels:
- FloatProperties: shared float data between Unity - Python (bidirectional)
- RawBytes: raw data that can be sent Unity - Python (bidirectional)
- EngineConfig: a set of numeric fields in a pre-defined order sent from Python to Unity
- Stats: (name, value, agg) messages sent from Unity to Python
Aside from the specific implementations of side channels we provide (and use ourselves),
the Side Channel interface is made available for users to create their own custom side
channels. As such, we guarantee that the built-in SideChannel interface between Unity and
Python is backward compatible in packages that share the same major version.
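To make that contract concrete, here is a minimal sketch of a user-defined side channel on the Python side. The channel id, class name, and string payload are illustrative, the import path is assumed from the package layout shown in this diff, and a channel with the same id must be registered on the Unity side:

```python
import uuid
from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage


class StringLogChannel(SideChannel):
    def __init__(self) -> None:
        # The id must match the id used by the corresponding C# side channel.
        super().__init__(uuid.UUID("621f0a70-4f87-11ea-a6bf-784f4387d1f7"))

    def on_message_received(self, msg: IncomingMessage) -> None:
        # Called for every message Unity sends on this channel.
        print(msg.read_string())

    def send_string(self, data: str) -> None:
        # Queued messages are delivered with the next step() or reset().
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)
```

The channel is then passed to the environment at construction time, e.g. `UnityEnvironment(file_name=..., side_channels=[StringLogChannel()])`.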

108
ml-agents-envs/mlagents_envs/env_utils.py


import glob
import os
import subprocess
from sys import platform
from typing import Optional, List
from mlagents_envs.logging_util import get_logger
from mlagents_envs.exception import UnityEnvironmentException
def get_platform():
"""
    Returns the platform of the operating system: linux, darwin, or win32
"""
return platform
def validate_environment_path(env_path: str) -> Optional[str]:
"""
Strip out executable extensions of the env_path
:param env_path: The path to the executable
"""
env_path = (
env_path.strip()
.replace(".app", "")
.replace(".exe", "")
.replace(".x86_64", "")
.replace(".x86", "")
)
true_filename = os.path.basename(os.path.normpath(env_path))
get_logger(__name__).debug("The true file name is {}".format(true_filename))
if not (glob.glob(env_path) or glob.glob(env_path + ".*")):
return None
cwd = os.getcwd()
launch_string = None
true_filename = os.path.basename(os.path.normpath(env_path))
if get_platform() == "linux" or get_platform() == "linux2":
candidates = glob.glob(os.path.join(cwd, env_path) + ".x86_64")
if len(candidates) == 0:
candidates = glob.glob(os.path.join(cwd, env_path) + ".x86")
if len(candidates) == 0:
candidates = glob.glob(env_path + ".x86_64")
if len(candidates) == 0:
candidates = glob.glob(env_path + ".x86")
if len(candidates) > 0:
launch_string = candidates[0]
elif get_platform() == "darwin":
candidates = glob.glob(
os.path.join(cwd, env_path + ".app", "Contents", "MacOS", true_filename)
)
if len(candidates) == 0:
candidates = glob.glob(
os.path.join(env_path + ".app", "Contents", "MacOS", true_filename)
)
if len(candidates) == 0:
candidates = glob.glob(
os.path.join(cwd, env_path + ".app", "Contents", "MacOS", "*")
)
if len(candidates) == 0:
candidates = glob.glob(
os.path.join(env_path + ".app", "Contents", "MacOS", "*")
)
if len(candidates) > 0:
launch_string = candidates[0]
elif get_platform() == "win32":
candidates = glob.glob(os.path.join(cwd, env_path + ".exe"))
if len(candidates) == 0:
candidates = glob.glob(env_path + ".exe")
if len(candidates) > 0:
launch_string = candidates[0]
return launch_string
def launch_executable(file_name: str, args: List[str]) -> subprocess.Popen:
"""
Launches a Unity executable and returns the process handle for it.
:param file_name: the name of the executable
:param args: List of string that will be passed as command line arguments
when launching the executable.
"""
launch_string = validate_environment_path(file_name)
if launch_string is None:
raise UnityEnvironmentException(
f"Couldn't launch the {file_name} environment. Provided filename does not match any environments."
)
else:
get_logger(__name__).debug("This is the launch string {}".format(launch_string))
# Launch Unity environment
subprocess_args = [launch_string] + args
try:
return subprocess.Popen(
subprocess_args,
# start_new_session=True means that signals to the parent python process
# (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms.
# This is generally good since we want the environment to have a chance to shutdown,
            # but may be undesirable in some cases; if so, we'll add a command-line toggle.
# Note that on Windows, the CTRL_C signal will still be sent.
start_new_session=True,
)
except PermissionError as perm:
# This is likely due to missing read or execute permissions on file.
raise UnityEnvironmentException(
f"Error when trying to launch environment - make sure "
f"permissions are set correctly. For example "
f'"chmod -R 755 {launch_string}"'
) from perm
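For context, this helper is normally invoked indirectly when a UnityEnvironment is constructed from a build; calling it directly is also possible. A hedged usage sketch (the build name "3DBall" is illustrative; `-logFile -` matches the player arguments used in the yamato tests above):

```python
from mlagents_envs.env_utils import launch_executable, validate_environment_path

# Resolve the platform-specific launch path first (returns None if nothing matches).
launch_path = validate_environment_path("3DBall")
if launch_path is not None:
    # Returns the subprocess.Popen handle for the running Unity player.
    proc = launch_executable("3DBall", ["-logFile", "-"])
    proc.terminate()  # shut the player down again (sketch only)
```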

81
ml-agents-envs/mlagents_envs/side_channel/side_channel_manager.py


import uuid
import struct
from typing import Dict, Optional, List
from mlagents_envs.side_channel import SideChannel, IncomingMessage
from mlagents_envs.exception import UnityEnvironmentException
from mlagents_envs.logging_util import get_logger
class SideChannelManager:
    def __init__(self, side_channels: Optional[List[SideChannel]] = None) -> None:
self._side_channels_dict = self._get_side_channels_dict(side_channels)
def process_side_channel_message(self, data: bytes) -> None:
"""
        Separates the data received from Unity into individual messages for each
registered side channel and calls on_message_received on them.
:param data: The packed message sent by Unity
"""
offset = 0
while offset < len(data):
try:
channel_id = uuid.UUID(bytes_le=bytes(data[offset : offset + 16]))
offset += 16
message_len, = struct.unpack_from("<i", data, offset)
offset = offset + 4
message_data = data[offset : offset + message_len]
offset = offset + message_len
except (struct.error, ValueError, IndexError):
raise UnityEnvironmentException(
"There was a problem reading a message in a SideChannel. "
"Please make sure the version of MLAgents in Unity is "
"compatible with the Python version."
)
if len(message_data) != message_len:
raise UnityEnvironmentException(
"The message received by the side channel {0} was "
"unexpectedly short. Make sure your Unity Environment "
"sending side channel data properly.".format(channel_id)
)
if channel_id in self._side_channels_dict:
incoming_message = IncomingMessage(message_data)
self._side_channels_dict[channel_id].on_message_received(
incoming_message
)
else:
get_logger(__name__).warning(
f"Unknown side channel data received. Channel type: {channel_id}."
)
def generate_side_channel_messages(self) -> bytearray:
"""
Gathers the messages that the registered side channels will send to Unity
and combines them into a single message ready to be sent.
"""
result = bytearray()
for channel_id, channel in self._side_channels_dict.items():
for message in channel.message_queue:
result += channel_id.bytes_le
result += struct.pack("<i", len(message))
result += message
channel.message_queue = []
return result
@staticmethod
def _get_side_channels_dict(
side_channels: Optional[List[SideChannel]]
) -> Dict[uuid.UUID, SideChannel]:
"""
Converts a list of side channels into a dictionary of channel_id to SideChannel
:param side_channels: The list of side channels.
"""
side_channels_dict: Dict[uuid.UUID, SideChannel] = {}
if side_channels is not None:
for _sc in side_channels:
if _sc.channel_id in side_channels_dict:
raise UnityEnvironmentException(
f"There cannot be two side channels with "
f"the same channel id {_sc.channel_id}."
)
side_channels_dict[_sc.channel_id] = _sc
return side_channels_dict
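For reference, each queued message is framed as the 16-byte little-endian channel UUID, a little-endian int32 payload length, and the payload itself; `generate_side_channel_messages` concatenates these frames and `process_side_channel_message` walks them back. A standalone sketch of that framing (the id and payload below are made up):

```python
import struct
import uuid

channel_id = uuid.uuid4()  # illustrative channel id
payload = b"hello"         # illustrative message body

# Pack one frame: UUID (bytes_le) + int32 length + payload.
frame = channel_id.bytes_le + struct.pack("<i", len(payload)) + payload

# Parse it back, mirroring process_side_channel_message.
parsed_id = uuid.UUID(bytes_le=bytes(frame[:16]))
(length,) = struct.unpack_from("<i", frame, 16)
body = frame[20 : 20 + length]
assert parsed_id == channel_id and body == payload
```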

64
ml-agents-envs/mlagents_envs/tests/test_env_utils.py


from unittest import mock
import pytest
from mlagents_envs.env_utils import validate_environment_path, launch_executable
from mlagents_envs.exception import UnityEnvironmentException
from mlagents_envs.logging_util import (
set_log_level,
get_logger,
INFO,
ERROR,
FATAL,
CRITICAL,
DEBUG,
)
def mock_glob_method(path):
"""
Given a path input, returns a list of candidates
"""
if ".x86" in path:
return ["linux"]
if ".app" in path:
return ["darwin"]
if ".exe" in path:
return ["win32"]
if "*" in path:
return "Any"
return []
@mock.patch("sys.platform")
@mock.patch("glob.glob")
def test_validate_path_empty(glob_mock, platform_mock):
glob_mock.return_value = None
path = validate_environment_path(" ")
assert path is None
@mock.patch("mlagents_envs.env_utils.get_platform")
@mock.patch("glob.glob")
def test_validate_path(glob_mock, platform_mock):
glob_mock.side_effect = mock_glob_method
for platform in ["linux", "darwin", "win32"]:
platform_mock.return_value = platform
path = validate_environment_path(" ")
assert path == platform
@mock.patch("glob.glob")
@mock.patch("subprocess.Popen")
def test_launch_executable(mock_popen, glob_mock):
with pytest.raises(UnityEnvironmentException):
launch_executable(" ", [])
glob_mock.return_value = ["FakeLaunchPath"]
launch_executable(" ", [])
mock_popen.side_effect = PermissionError("Fake permission error")
with pytest.raises(UnityEnvironmentException):
launch_executable(" ", [])
def test_set_logging_level():
for level in [INFO, ERROR, FATAL, CRITICAL, DEBUG]:
set_log_level(level)
assert get_logger("test").level == level

102
ml-agents-envs/mlagents_envs/tests/test_steps.py


import pytest
import numpy as np
from mlagents_envs.base_env import (
DecisionSteps,
TerminalSteps,
ActionType,
BehaviorSpec,
)
def test_decision_steps():
ds = DecisionSteps(
obs=[np.array(range(12), dtype=np.float32).reshape(3, 4)],
reward=np.array(range(3), dtype=np.float32),
agent_id=np.array(range(10, 13), dtype=np.int32),
action_mask=[np.zeros((3, 4), dtype=np.bool)],
)
assert ds.agent_id_to_index[10] == 0
assert ds.agent_id_to_index[11] == 1
assert ds.agent_id_to_index[12] == 2
with pytest.raises(KeyError):
assert ds.agent_id_to_index[-1] == -1
mask_agent = ds[10].action_mask
assert isinstance(mask_agent, list)
assert len(mask_agent) == 1
assert np.array_equal(mask_agent[0], np.zeros((4), dtype=np.bool))
for agent_id in ds:
assert ds.agent_id_to_index[agent_id] in range(3)
def test_empty_decision_steps():
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.CONTINUOUS,
action_shape=3,
)
ds = DecisionSteps.empty(specs)
assert len(ds.obs) == 2
assert ds.obs[0].shape == (0, 3, 2)
assert ds.obs[1].shape == (0, 5)
def test_terminal_steps():
ts = TerminalSteps(
obs=[np.array(range(12), dtype=np.float32).reshape(3, 4)],
reward=np.array(range(3), dtype=np.float32),
agent_id=np.array(range(10, 13), dtype=np.int32),
interrupted=np.array([1, 0, 1], dtype=np.bool),
)
assert ts.agent_id_to_index[10] == 0
assert ts.agent_id_to_index[11] == 1
assert ts.agent_id_to_index[12] == 2
assert ts[10].interrupted
assert not ts[11].interrupted
assert ts[12].interrupted
with pytest.raises(KeyError):
assert ts.agent_id_to_index[-1] == -1
for agent_id in ts:
assert ts.agent_id_to_index[agent_id] in range(3)
def test_empty_terminal_steps():
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.CONTINUOUS,
action_shape=3,
)
ts = TerminalSteps.empty(specs)
assert len(ts.obs) == 2
assert ts.obs[0].shape == (0, 3, 2)
assert ts.obs[1].shape == (0, 5)
def test_specs():
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.CONTINUOUS,
action_shape=3,
)
assert specs.discrete_action_branches is None
assert specs.action_size == 3
assert specs.create_empty_action(5).shape == (5, 3)
assert specs.create_empty_action(5).dtype == np.float32
specs = BehaviorSpec(
observation_shapes=[(3, 2), (5,)],
action_type=ActionType.DISCRETE,
action_shape=(3,),
)
assert specs.discrete_action_branches == (3,)
assert specs.action_size == 1
assert specs.create_empty_action(5).shape == (5, 1)
assert specs.create_empty_action(5).dtype == np.int32