
Merge branch 'master' into sensitivity

/sensitivity
Andrew Cohen, 4 years ago
Current commit
06e4356c
64 files changed, with 977 insertions and 561 deletions
  1. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (2)
  2. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs (12)
  3. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs (53)
  4. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (30)
  5. com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (18)
  6. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (2)
  7. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs (13)
  8. com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs (34)
  9. com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (22)
  10. com.unity.ml-agents/CHANGELOG.md (5)
  11. docs/Getting-Started.md (2)
  12. docs/Training-Configuration-File.md (2)
  13. docs/Training-on-Microsoft-Azure.md (2)
  14. docs/Using-Docker.md (4)
  15. docs/Using-Tensorboard.md (2)
  16. docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md (4)
  17. ml-agents-envs/setup.py (2)
  18. ml-agents/mlagents/model_serialization.py (2)
  19. ml-agents/mlagents/trainers/agent_processor.py (5)
  20. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (2)
  21. ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (2)
  22. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2)
  23. ml-agents/mlagents/trainers/env_manager.py (29)
  24. ml-agents/mlagents/trainers/ghost/trainer.py (11)
  25. ml-agents/mlagents/trainers/optimizer/optimizer.py (3)
  26. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2)
  27. ml-agents/mlagents/trainers/policy/policy.py (166)
  28. ml-agents/mlagents/trainers/policy/tf_policy.py (334)
  29. ml-agents/mlagents/trainers/ppo/optimizer.py (2)
  30. ml-agents/mlagents/trainers/ppo/trainer.py (21)
  31. ml-agents/mlagents/trainers/sac/network.py (4)
  32. ml-agents/mlagents/trainers/sac/optimizer.py (6)
  33. ml-agents/mlagents/trainers/sac/trainer.py (17)
  34. ml-agents/mlagents/trainers/settings.py (12)
  35. ml-agents/mlagents/trainers/stats.py (17)
  36. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (2)
  37. ml-agents/mlagents/trainers/tests/test_bcmodule.py (7)
  38. ml-agents/mlagents/trainers/tests/test_distributions.py (2)
  39. ml-agents/mlagents/trainers/tests/test_models.py (2)
  40. ml-agents/mlagents/trainers/tests/test_nn_policy.py (17)
  41. ml-agents/mlagents/trainers/tests/test_ppo.py (15)
  42. ml-agents/mlagents/trainers/tests/test_reward_signals.py (6)
  43. ml-agents/mlagents/trainers/tests/test_sac.py (13)
  44. ml-agents/mlagents/trainers/tests/test_simple_rl.py (3)
  45. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
  46. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (3)
  47. ml-agents/mlagents/trainers/trainer/rl_trainer.py (4)
  48. ml-agents/mlagents/trainers/trainer/trainer.py (9)
  49. ml-agents/mlagents/trainers/trainer_controller.py (36)
  50. ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py (2)
  51. ml-agents/mlagents/trainers/tf/models.py (13)
  52. ml-agents/mlagents/trainers/tf/distributions.py (2)
  53. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs (147)
  54. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs.meta (11)
  55. com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs (27)
  56. com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs.meta (11)
  57. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs (62)
  58. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs.meta (11)
  59. ml-agents/mlagents/trainers/tf/__init__.py (0)
  60. ml-agents/mlagents/trainers/policy/nn_policy.py (285)
  61. /ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py (0)
  62. /ml-agents/mlagents/trainers/tf/models.py (0)
  63. /ml-agents/mlagents/trainers/tf/distributions.py (0)

com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (2)


var t = go.transform;
return new Pose { rotation = t.rotation, position = t.position };
}
internal ArticulationBody[] Bodies => m_Bodies;
}
}
#endif // UNITY_2020_1_OR_NEWER

com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs (12)


// TODO static method in PhysicsBodySensor?
// TODO only update PoseExtractor when body changes?
var poseExtractor = new ArticulationBodyPoseExtractor(RootBody);
var numTransformObservations = Settings.TransformSize(poseExtractor.NumPoses);
return new[] { numTransformObservations };
var numPoseObservations = poseExtractor.GetNumPoseObservations(Settings);
var numJointObservations = 0;
// Start from i=1 to ignore the root
for (var i = 1; i < poseExtractor.Bodies.Length; i++)
{
numJointObservations += ArticulationBodyJointExtractor.NumObservations(
poseExtractor.Bodies[i], Settings
);
}
return new[] { numPoseObservations + numJointObservations };
}
}

com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs (53)


string m_SensorName;
PoseExtractor m_PoseExtractor;
IJointExtractor[] m_JointExtractors;
PhysicsSensorSettings m_Settings;
/// <summary>

/// <param name="sensorName"></param>
public PhysicsBodySensor(Rigidbody rootBody, GameObject rootGameObject, PhysicsSensorSettings settings, string sensorName=null)
{
m_PoseExtractor = new RigidBodyPoseExtractor(rootBody, rootGameObject);
var poseExtractor = new RigidBodyPoseExtractor(rootBody, rootGameObject);
m_PoseExtractor = poseExtractor;
var numTransformObservations = settings.TransformSize(m_PoseExtractor.NumPoses);
m_Shape = new[] { numTransformObservations };
var numJointExtractorObservations = 0;
var rigidBodies = poseExtractor.Bodies;
if (rigidBodies != null)
{
m_JointExtractors = new IJointExtractor[rigidBodies.Length - 1]; // skip the root
for (var i = 1; i < rigidBodies.Length; i++)
{
var jointExtractor = new RigidBodyJointExtractor(rigidBodies[i]);
numJointExtractorObservations += jointExtractor.NumObservations(settings);
m_JointExtractors[i - 1] = jointExtractor;
}
}
else
{
m_JointExtractors = new IJointExtractor[0];
}
var numTransformObservations = m_PoseExtractor.GetNumPoseObservations(settings);
m_Shape = new[] { numTransformObservations + numJointExtractorObservations };
m_PoseExtractor = new ArticulationBodyPoseExtractor(rootBody);
var poseExtractor = new ArticulationBodyPoseExtractor(rootBody);
m_PoseExtractor = poseExtractor;
var numTransformObservations = settings.TransformSize(m_PoseExtractor.NumPoses);
m_Shape = new[] { numTransformObservations };
var numJointExtractorObservations = 0;
var articBodies = poseExtractor.Bodies;
if (articBodies != null)
{
m_JointExtractors = new IJointExtractor[articBodies.Length - 1]; // skip the root
for (var i = 1; i < articBodies.Length; i++)
{
var jointExtractor = new ArticulationBodyJointExtractor(articBodies[i]);
numJointExtractorObservations += jointExtractor.NumObservations(settings);
m_JointExtractors[i - 1] = jointExtractor;
}
}
else
{
m_JointExtractors = new IJointExtractor[0];
}
var numTransformObservations = m_PoseExtractor.GetNumPoseObservations(settings);
m_Shape = new[] { numTransformObservations + numJointExtractorObservations };
}
#endif

public int Write(ObservationWriter writer)
{
var numWritten = writer.WritePoses(m_Settings, m_PoseExtractor);
foreach (var jointExtractor in m_JointExtractors)
{
numWritten += jointExtractor.Write(m_Settings, writer, numWritten);
}
return numWritten;
}

com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (30)


public bool UseLocalSpaceLinearVelocity;
/// <summary>
/// Whether to use joint-specific positions and angles as observations.
/// </summary>
public bool UseJointPositionsAndAngles;
/// <summary>
/// Whether to use the joint forces and torques that are applied by the solver as observations.
/// </summary>
public bool UseJointForces;
/// <summary>
/// Creates a PhysicsSensorSettings with reasonable default values.
/// </summary>
/// <returns></returns>

public bool UseLocalSpace
{
get { return UseLocalSpaceTranslations || UseLocalSpaceRotations || UseLocalSpaceLinearVelocity; }
}
/// <summary>
/// The number of floats needed to represent a given number of transforms.
/// </summary>
/// <param name="numTransforms"></param>
/// <returns></returns>
public int TransformSize(int numTransforms)
{
int obsPerTransform = 0;
obsPerTransform += UseModelSpaceTranslations ? 3 : 0;
obsPerTransform += UseModelSpaceRotations ? 4 : 0;
obsPerTransform += UseLocalSpaceTranslations ? 3 : 0;
obsPerTransform += UseLocalSpaceRotations ? 4 : 0;
obsPerTransform += UseModelSpaceLinearVelocity ? 3 : 0;
obsPerTransform += UseLocalSpaceLinearVelocity ? 3 : 0;
return numTransforms * obsPerTransform;
}
}
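
To make the size arithmetic in `TransformSize` (and the equivalent `GetNumPoseObservations` added in `PoseExtractor` below) concrete, here is a small Python restatement. It is only an illustration, not part of the C# package, and the dictionary keys are invented for the example:

```python
# Hypothetical Python restatement of the observation-size arithmetic above:
# each enabled translation or velocity option adds 3 floats per pose, each
# enabled rotation option adds 4 floats (a quaternion), and the total scales
# with the number of poses in the hierarchy.
def num_pose_observations(num_poses: int, settings: dict) -> int:
    obs_per_pose = 0
    obs_per_pose += 3 if settings.get("model_space_translations") else 0
    obs_per_pose += 4 if settings.get("model_space_rotations") else 0
    obs_per_pose += 3 if settings.get("local_space_translations") else 0
    obs_per_pose += 4 if settings.get("local_space_rotations") else 0
    obs_per_pose += 3 if settings.get("model_space_linear_velocity") else 0
    obs_per_pose += 3 if settings.get("local_space_linear_velocity") else 0
    return num_poses * obs_per_pose

# e.g. model-space translations + rotations for a 5-pose hierarchy: (3 + 4) * 5 = 35 floats
print(num_pose_observations(5, {"model_space_translations": True, "model_space_rotations": True}))
```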

com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (18)


}
}
/// <summary>
/// Compute the number of floats needed to represent the poses for the given PhysicsSensorSettings.
/// </summary>
/// <param name="settings"></param>
/// <returns></returns>
public int GetNumPoseObservations(PhysicsSensorSettings settings)
{
int obsPerPose = 0;
obsPerPose += settings.UseModelSpaceTranslations ? 3 : 0;
obsPerPose += settings.UseModelSpaceRotations ? 4 : 0;
obsPerPose += settings.UseLocalSpaceTranslations ? 3 : 0;
obsPerPose += settings.UseLocalSpaceRotations ? 4 : 0;
obsPerPose += settings.UseModelSpaceLinearVelocity ? 3 : 0;
obsPerPose += settings.UseLocalSpaceLinearVelocity ? 3 : 0;
return NumPoses * obsPerPose;
}
internal void DrawModelSpace(Vector3 offset)
{

com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (2)


var body = m_Bodies[index];
return new Pose { rotation = body.rotation, position = body.position };
}
internal Rigidbody[] Bodies => m_Bodies;
}
}

com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs (13)


// TODO static method in PhysicsBodySensor?
// TODO only update PoseExtractor when body changes?
var poseExtractor = new RigidBodyPoseExtractor(RootBody, gameObject);
var numTransformObservations = Settings.TransformSize(poseExtractor.NumPoses);
return new[] { numTransformObservations };
var numPoseObservations = poseExtractor.GetNumPoseObservations(Settings);
var numJointObservations = 0;
// Start from i=1 to ignore the root
for (var i = 1; i < poseExtractor.Bodies.Length; i++)
{
var body = poseExtractor.Bodies[i];
var joint = body?.GetComponent<Joint>();
numJointObservations += RigidBodyJointExtractor.NumObservations(body, joint, Settings);
}
return new[] { numPoseObservations + numJointObservations };
}
}

com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs (34)


0f, 0f, 0f, 1f // LocalSpaceRotations
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
[Test]

var leafArticBody = leafGameObj.AddComponent<ArticulationBody>();
leafGameObj.transform.SetParent(middleGamObj.transform);
leafGameObj.transform.localPosition = new Vector3(4.2f, 0f, 0f);
leafArticBody.jointType = ArticulationJointType.RevoluteJoint;
leafArticBody.jointType = ArticulationJointType.PrismaticJoint;
leafArticBody.linearLockZ = ArticulationDofLock.LimitedMotion;
leafArticBody.zDrive = new ArticulationDrive
{
lowerLimit = -3,
upperLimit = 1
};
#if UNITY_2020_2_OR_NEWER
// ArticulationBody.velocity is read-only in 2020.1

#endif
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
// Update the settings to only process joint observations
sensorComponent.Settings = new PhysicsSensorSettings
{
UseJointForces = true,
UseJointPositionsAndAngles = true,
};
sensor = sensorComponent.CreateSensor();
sensor.Update();
expected = new[]
{
// revolute
0f, 1f, // joint1.position (sin and cos)
0f, // joint1.force
// prismatic
0.5f, // joint2.position (interpolate between limits)
0f, // joint2.force
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
}
}

com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (22)


0f, 0f, 0f, 1f // LocalSpaceRotations
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
[Test]

0f, -1f, 1f // Leaf vel
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
// Update the settings to only process joint observations
sensorComponent.Settings = new PhysicsSensorSettings
{
UseJointPositionsAndAngles = true,
UseJointForces = true,
};
sensor = sensorComponent.CreateSensor();
sensor.Update();
expected = new[]
{
0f, 0f, 0f, // joint1.force
0f, 0f, 0f, // joint1.torque
0f, 0f, 0f, // joint2.force
0f, 0f, 0f, // joint2.torque
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
}

com.unity.ml-agents/CHANGELOG.md (5)


### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
The minimum supported python version for ml-agents-envs was changed to 3.6.1. (#4244)
- The minimum supported python version for ml-agents-envs was changed to 3.6.1. (#4244)
- The interaction between EnvManager and TrainerController was changed; EnvManager.advance() was split into two stages,
and TrainerController now uses the results from the first stage to handle new behavior names. This change speeds up
Python training by approximately 5-10%. (#4259)
### Minor Changes
#### com.unity.ml-agents (C#)

docs/Getting-Started.md (2)


TensorBoard. From the command line run:
```sh
tensorboard --logdir=results
tensorboard --logdir results
```
Then navigate to `localhost:6006` in your browser to view the TensorBoard

docs/Training-Configuration-File.md (2)


| `hyperparameters -> beta` | (default = `5.0e-3`) Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `hyperparameters -> epsilon` | (default = `0.2`) Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `hyperparameters -> num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
| `hyperparameters -> num_epoch` | (default = `3`) Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
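
For reference, the Generalized Advantage Estimate that the `lambd` entry above parameterizes (from the linked GAE paper) is

$$
\hat{A}_t = \sum_{l=0}^{\infty} (\gamma\lambda)^l \delta_{t+l},
\qquad \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t),
$$

so `lambd = 0` reduces to the one-step TD advantage (lower variance, higher bias) and `lambd = 1` to the discounted return minus the value baseline (higher variance, lower bias).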
### SAC-specific Configurations

docs/Training-on-Microsoft-Azure.md (2)


2. Unless you started the training as a background process, connect to your VM
from another terminal instance.
3. Run the following command from your terminal
`tensorboard --logdir=summaries --host 0.0.0.0`
`tensorboard --logdir results --host 0.0.0.0`
4. You should now be able to open a browser and navigate to
`<Your_VM_IP_Address>:6060` to view the TensorBoard report.

docs/Using-Docker.md (4)


http://localhost:6006:
```sh
docker exec -it <container-name> tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
docker exec -it <container-name> tensorboard --logdir /unity-volume/results --host 0.0.0.0
docker exec -it 3DBallContainer.first.trial tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
docker exec -it 3DBallContainer.first.trial tensorboard --logdir /unity-volume/results --host 0.0.0.0
```
For more details on Tensorboard, check out the documentation about

docs/Using-Tensorboard.md (2)


1. Open a terminal or console window:
1. Navigate to the directory where the ML-Agents Toolkit is installed.
1. From the command line run: `tensorboard --logdir=results --port=6006`
1. From the command line run: `tensorboard --logdir results --port 6006`
1. Open a browser window and navigate to
[localhost:6006](http://localhost:6006).

docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md (4)


### Observing Training Progress
Once training with `learn.py` has started as described in the previous section, the `ml-agents` folder will
contain a `summaries` directory. To observe the training process in more detail,
contain a `results` directory. To observe the training process in more detail,
`tensorboard --logdir=summaries`
`tensorboard --logdir results`
then navigate to `localhost:6006`

ml-agents-envs/setup.py (2)


install_requires=[
"cloudpickle",
"grpcio>=1.11.0",
"numpy>=1.14.1,<2.0",
"numpy>=1.14.1,<1.19.0",
"Pillow>=4.2.1",
"protobuf>=3.6",
"pyyaml>=3.1.0",

ml-agents/mlagents/model_serialization.py (2)


from tensorflow.python.framework import graph_util
from mlagents_envs.logging_util import get_logger
from mlagents.trainers import tensorflow_to_barracuda as tf2bc
from mlagents.trainers.tf import tensorflow_to_barracuda as tf2bc
if LooseVersion(tf.__version__) < LooseVersion("1.12.0"):
# ONNX is only tested on 1.12.0 and later

ml-agents/mlagents/trainers/agent_processor.py (5)


EnvironmentStats,
)
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter

def __init__(
self,
policy: TFPolicy,
policy: Policy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,

def __init__(
self,
policy: TFPolicy,
policy: Policy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,

ml-agents/mlagents/trainers/components/reward_signals/__init__.py (2)


"""
Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
the reward strength, and the gamma (discount factor).
:param policy: The Policy object (e.g. NNPolicy) that this Reward Signal will apply to.
:param policy: The Policy object (e.g. TFPolicy) that this Reward Signal will apply to.
:param settings: Settings parameters for this Reward Signal, including gamma and strength.
:return: A RewardSignal object.
"""

ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (2)


from typing import List, Tuple
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2)


from mlagents.tf_utils import tf
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
EPSILON = 1e-7

ml-agents/mlagents/trainers/env_manager.py (29)


)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue
from mlagents.trainers.action_info import ActionInfo
from mlagents_envs.logging_util import get_logger

class EnvManager(ABC):
def __init__(self):
self.policies: Dict[BehaviorName, TFPolicy] = {}
self.policies: Dict[BehaviorName, Policy] = {}
self.first_step_infos: List[EnvironmentStep] = None
self.first_step_infos: List[EnvironmentStep] = []
def set_policy(self, brain_name: BehaviorName, policy: TFPolicy) -> None:
def set_policy(self, brain_name: BehaviorName, policy: Policy) -> None:
self.policies[brain_name] = policy
if brain_name in self.agent_managers:
self.agent_managers[brain_name].policy = policy

def close(self):
pass
def advance(self):
def get_steps(self) -> List[EnvironmentStep]:
"""
Updates the policies, steps the environments, and returns the step information from the environments.
Calling code should pass the returned EnvironmentSteps to process_steps() after calling this.
:return: The list of EnvironmentSteps
"""
if self.first_step_infos is not None:
if self.first_step_infos:
self.first_step_infos = None
self.first_step_infos = []
for brain_name in self.training_behaviors:
for brain_name in self.agent_managers.keys():
_policy = None
try:
# We make sure to empty the policy queue before continuing to produce steps.

except AgentManagerQueue.Empty:
if _policy is not None:
self.set_policy(brain_name, _policy)
# Step the environment
# policy_queue contains Policy, but we need a TFPolicy here
self.set_policy(brain_name, _policy) # type: ignore
# Step the environments
return new_step_infos
def process_steps(self, new_step_infos: List[EnvironmentStep]) -> int:
# Add to AgentProcessor
num_step_infos = self._process_step_infos(new_step_infos)
return num_step_infos
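
To summarize the split shown above, here is a minimal sketch of the new two-stage calling pattern; the wrapper function name is made up, and `TrainerController` later in this diff does essentially the same thing:

```python
from mlagents.trainers.env_manager import EnvManager  # as modified above


def pump_environments(env_manager: EnvManager) -> int:
    # Stage 1: update policies from their queues, step the environments,
    # and collect the resulting EnvironmentSteps.
    new_step_infos = env_manager.get_steps()
    # The caller can inspect new_step_infos here, e.g. to register behavior
    # names it has not seen before, before any trajectories are processed.
    # Stage 2: hand the steps to the AgentManagers for trajectory assembly.
    return env_manager.process_steps(new_step_infos)
```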

ml-agents/mlagents/trainers/ghost/trainer.py (11)


# ## ML-Agent Learning (Ghost Trainer)
from collections import defaultdict
from typing import Deque, Dict, DefaultDict, List, cast
from typing import Deque, Dict, DefaultDict, List
import numpy as np

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trajectory import Trajectory

for brain_name in self._internal_policy_queues:
internal_policy_queue = self._internal_policy_queues[brain_name]
try:
policy = cast(TFPolicy, internal_policy_queue.get_nowait())
policy = internal_policy_queue.get_nowait()
self.current_policy_snapshot[brain_name] = policy.get_weights()
except AgentManagerQueue.Empty:
pass

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
) -> Policy:
"""
Creates policy with the wrapped trainer's create_policy function
The first policy encountered sets the wrapped

return policy
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to GhostTrainer.

self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
self.policies[name_behavior_id] = policy
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy associated with name_behavior_id
:param name_behavior_id: Fully qualified behavior name

ml-agents/mlagents/trainers/optimizer/optimizer.py (3)


Provides methods to update the Policy.
"""
def __init__(self):
self.reward_signals = {}
@abc.abstractmethod
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2)


class TFOptimizer(Optimizer): # pylint: disable=W0223
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
super().__init__()
self.sess = policy.sess
self.policy = policy
self.update_dict: Dict[str, tf.Tensor] = {}

Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
# Create reward signals
for reward_signal, settings in reward_signal_configs.items():
# Name reward signals by string in case we have duplicates later

ml-agents/mlagents/trainers/policy/policy.py (166)


from abc import ABC, abstractmethod
from abc import abstractmethod
from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.exception import UnityException
from mlagents.model_serialization import SerializationSettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
class Policy(ABC):
@abstractmethod
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class Policy:
def __init__(
self,
seed: int,
behavior_spec: BehaviorSpec,
trainer_settings: TrainerSettings,
model_path: str,
load: bool = False,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
):
self.behavior_spec = behavior_spec
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()
else [behavior_spec.action_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.model_path = model_path
self.initialize_path = self.trainer_settings.init_path
self._keep_checkpoints = self.trainer_settings.keep_checkpoints
self.use_continuous_act = behavior_spec.is_action_continuous()
self.num_branches = self.behavior_spec.action_size
self.previous_action_dict: Dict[str, np.array] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None
self.load = load
self.h_size = self.network_settings.hidden_units
num_layers = self.network_settings.num_layers
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.m_size = 0
self.sequence_length = 1
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
raise NotImplementedError
@abstractmethod
def update_normalization(self, vector_obs: np.ndarray) -> None:
pass
@abstractmethod
def increment_step(self, n_steps):
pass
@abstractmethod
def get_current_step(self):
pass
@abstractmethod
def checkpoint(self, checkpoint_path: str, settings: SerializationSettings) -> None:
pass
@abstractmethod
def save(self, output_filepath: str, settings: SerializationSettings) -> None:
pass
@abstractmethod
def load_weights(self, values: List[np.ndarray]) -> None:
pass
@abstractmethod
def get_weights(self) -> List[np.ndarray]:
return []
@abstractmethod
def init_load_weights(self) -> None:
pass
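
For orientation, here is a minimal sketch (not part of this PR) of what a concrete subclass of the new `Policy` base class has to supply, based on the abstract methods listed above; the class name and the trivial method bodies are invented for illustration:

```python
from typing import List
import numpy as np

from mlagents.trainers.policy import Policy  # the base class shown above


class NoopPolicy(Policy):
    """Hypothetical do-nothing subclass; real subclasses (e.g. TFPolicy) build a model here."""

    def __init__(self, seed, behavior_spec, trainer_settings, model_path):
        super().__init__(seed, behavior_spec, trainer_settings, model_path)
        self._step = 0  # step counter stands in for a real training step variable

    def update_normalization(self, vector_obs: np.ndarray) -> None:
        pass  # no observation normalizer in this sketch

    def increment_step(self, n_steps):
        self._step += n_steps
        return self._step

    def get_current_step(self):
        return self._step

    def checkpoint(self, checkpoint_path: str, settings) -> None:
        pass  # nothing to write for a stateless policy

    def save(self, output_filepath: str, settings) -> None:
        pass

    def init_load_weights(self) -> None:
        pass

    def load_weights(self, values: List[np.ndarray]) -> None:
        pass

    def get_weights(self) -> List[np.ndarray]:
        return []
```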

ml-agents/mlagents/trainers/policy/tf_policy.py (334)


from typing import Any, Dict, List, Optional, Tuple
import abc
from mlagents_envs.timers import timed
from mlagents.model_serialization import SerializationSettings, export_policy_model
from mlagents.tf_utils import tf

from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers.tf.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)
logger = get_logger(__name__)

# determines compatibility with inference in Barracuda.
MODEL_FORMAT_VERSION = 2
EPSILON = 1e-6 # Small value to avoid divide by zero
class UnityPolicyException(UnityException):
"""

trainer_settings: TrainerSettings,
model_path: str,
load: bool = False,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
create_tf_graph: bool = True,
):
"""
Initializes the policy.

:param model_path: Where to load/save the model.
:param load: If True, load model from model_path. Otherwise, create new model.
"""
self.m_size = 0
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
super().__init__(
seed,
behavior_spec,
trainer_settings,
model_path,
load,
tanh_squash,
reparameterize,
condition_sigma_on_obs,
)
self.inference_dict: Dict[str, tf.Tensor] = {}
self.sequence_length = 1
self.seed = seed
self.behavior_spec = behavior_spec
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()
else [behavior_spec.action_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.inference_dict: Dict[str, tf.Tensor] = {}
self.use_recurrent = self.network_settings.memory is not None
self.memory_dict: Dict[str, np.ndarray] = {}
self.num_branches = self.behavior_spec.action_size
self.previous_action_dict: Dict[str, np.array] = {}
self.normalize = self.network_settings.normalize
self.use_continuous_act = behavior_spec.is_action_continuous()
self.model_path = model_path
self.initialize_path = self.trainer_settings.init_path
self.keep_checkpoints = self.trainer_settings.keep_checkpoints
self.seed = seed
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
self.load = load
self.grads = None
self.update_batch: Optional[tf.Operation] = None
self.trainable_variables: List[tf.Variable] = []
if create_tf_graph:
self.create_tf_graph()
@abc.abstractmethod
pass
return self.trainable_variables
@abc.abstractmethod
def create_tf_graph(self):
def create_tf_graph(self) -> None:
pass
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
if len(_vars) > 0:
# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
encoded = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.num_layers,
self.vis_encode_type,
)
if self.use_continuous_act:
self._create_cc_actor(
encoded,
self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
)
else:
self._create_dc_actor(encoded)
self.saliency = tf.reduce_mean(
tf.square(tf.gradients(self.output, self.vector_in)), axis=1
)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
self.inference_dict = {
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.memory_out
# We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
# it will re-load the full graph
self._initialize_graph()
def _create_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("policy"):
encoded = ModelUtils.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
return encoded
@staticmethod
def _convert_version_string(version_string: str) -> Tuple[int, ...]:

def _initialize_graph(self):
with self.graph.as_default():
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
logger.info(f"Loading model from {model_path}.")
ckpt = tf.train.get_checkpoint_state(model_path)
if ckpt is None:

feed_dict[assign_ph] = value
self.sess.run(self.assign_ops, feed_dict=feed_dict)
@timed
:param decision_requests: DecisionSteps input to network.
:return: Output from policy based on self.inference_dict.
:param decision_requests: DecisionSteps object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
:return: Outputs from network as defined by self.inference_dict.
raise UnityPolicyException("The evaluate function was not implemented.")
feed_dict = {
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0

mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
feed_dict[self.action_masks] = mask
return feed_dict
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
def get_current_step(self):
"""

trainable=False,
dtype=tf.int32,
)
def _create_cc_actor(
self,
encoded: tf.Tensor,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
with tf.variable_scope("policy"):
distribution = GaussianDistribution(
hidden_policy,
self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:
self.output_pre = distribution.sample
self.output = tf.identity(self.output_pre, name="action")
else:
self.output_pre = distribution.sample
# Clip and scale output to ensure actions are always within [-1, 1] range.
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(self.output)
self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
self.entropy = distribution.entropy
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
"""
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
hidden_policy,
self.memory_in,
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
else:
hidden_policy = encoded
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
with tf.variable_scope("policy"):
distribution = MultiCategoricalDistribution(
hidden_policy, self.act_size, self.action_masks
)
# It's important that we are able to feed_dict a value into this tensor to get the
# right one-hot encoding, so we can't do identity on it.
self.output = distribution.sample
self.all_log_probs = tf.identity(distribution.log_probs, name="action")
self.selected_actions = tf.stop_gradient(
distribution.sample_onehot
) # In discrete, these are onehot
self.entropy = distribution.entropy
self.total_log_probs = distribution.total_log_probs

ml-agents/mlagents/trainers/ppo/optimizer.py (2)


import numpy as np
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.tf.models import ModelUtils, EncoderType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer

ml-agents/mlagents/trainers/ppo/trainer.py (21)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory

)
self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: Policy = None # type: ignore
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""

:param behavior_spec: specifications for policy construction
:return policy
"""
policy = NNPolicy(
policy = TFPolicy(
self.is_training,
self.artifact_path,
self.load,
model_path=self.artifact_path,
load=self.load,
condition_sigma_on_obs=False, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
)

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
self.optimizer = PPOOptimizer(
cast(TFPolicy, self.policy), self.trainer_settings
)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy

ml-agents/mlagents/trainers/sac/network.py (4)


from typing import Dict, Optional
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import EncoderType
LOG_STD_MAX = 2
LOG_STD_MIN = -20

self._create_memory_ins(m_size)
hidden_critic = self._create_observation_in(vis_encode_type)
self.policy.output = self.policy.output
# Use the sequence length of the policy
self.sequence_length_ph = self.policy.sequence_length_ph

ml-agents/mlagents/trainers/sac/optimizer.py (6)


from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer

# Non-exposed SAC parameters
self.discrete_target_entropy_scale = (
0.2
) # Roughly equal to e-greedy 0.05
0.2 # Roughly equal to e-greedy 0.05
)
self.continuous_target_entropy_scale = 1.0
stream_names = list(self.reward_signals.keys())

ml-agents/mlagents/trainers/sac/trainer.py (17)


from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations

self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: Policy = None # type: ignore
self.optimizer: SACOptimizer = None # type: ignore
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
policy = NNPolicy(
policy = TFPolicy(
self.is_training,
self.artifact_path,
self.load,
tanh_squash=True,

self._stats_reporter.add_stat(stat, np.mean(stat_list))
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
self.optimizer = SACOptimizer(
cast(TFPolicy, self.policy), self.trainer_settings
)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

max(1, self.step / self.reward_signal_steps_per_update)
)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy

ml-agents/mlagents/trainers/settings.py (12)


from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser
from mlagents.trainers.cli_utils import load_config
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.models import ScheduleType, EncoderType
from mlagents_envs import logging_util
from mlagents_envs.side_channel.environment_parameters_channel import (

class ExportableSettings:
def as_dict(self):
return cattr.unstructure(self)
class EncoderType(Enum):
SIMPLE = "simple"
NATURE_CNN = "nature_cnn"
RESNET = "resnet"
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"
@attr.s(auto_attribs=True)

ml-agents/mlagents/trainers/stats.py (17)


from collections import defaultdict
from enum import Enum
from typing import List, Dict, NamedTuple, Any
from typing import List, Dict, NamedTuple, Any, Optional
import numpy as np
import abc
import csv

) -> None:
if property_type == StatsPropertyType.HYPERPARAMETERS:
assert isinstance(value, dict)
text = self._dict_to_tensorboard("Hyperparameters", value)
summary = self._dict_to_tensorboard("Hyperparameters", value)
self.summary_writers[category].add_summary(text, 0)
if summary is not None:
self.summary_writers[category].add_summary(summary, 0)
elif property_type == StatsPropertyType.SALIENCY:
self._maybe_create_summary_writer(category)

#with tf.Session(config=generate_session_config()) as sess:
# hist_op = tf.summary.histogram(category, value)
# hist = sess.run(hist_op)
def _dict_to_tensorboard(self, name: str, input_dict: Dict[str, Any]) -> str:
def _dict_to_tensorboard(
self, name: str, input_dict: Dict[str, Any]
) -> Optional[bytes]:
"""
Convert a dict to a Tensorboard-encoded string.
:param name: The name of the text.

s = sess.run(s_op)
return s
except Exception:
logger.warning("Could not write text summary for Tensorboard.")
return ""
logger.warning(
f"Could not write {name} summary for Tensorboard: {input_dict}"
)
return None
class CSVWriter(StatsWriter):

ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (2)


import tempfile
import pytest
import mlagents.trainers.tensorflow_to_barracuda as tf2bc
import mlagents.trainers.tf.tensorflow_to_barracuda as tf2bc
from mlagents.trainers.tests.test_nn_policy import create_policy_mock
from mlagents.trainers.settings import TrainerSettings
from mlagents.tf_utils import tf

ml-agents/mlagents/trainers/tests/test_bcmodule.py (7)


import numpy as np
import os
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule
from mlagents.trainers.settings import (
TrainerSettings,

trainer_config.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = NNPolicy(
policy = TFPolicy(
False,
"test",
False,
tanhresample,

assert isinstance(item, np.float32)
old_learning_rate = bc_module.current_lr
stats = bc_module.update()
_ = bc_module.update()
assert old_learning_rate == bc_module.current_lr

ml-agents/mlagents/trainers/tests/test_distributions.py (2)


from mlagents.tf_utils import tf
from mlagents.trainers.distributions import (
from mlagents.trainers.tf.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)

ml-agents/mlagents/trainers/tests/test_models.py (2)


import pytest
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.tf_utils import tf
from mlagents_envs.base_env import BehaviorSpec, ActionType

ml-agents/mlagents/trainers/tests/test_nn_policy.py (17)


from mlagents.tf_utils import tf
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.models import EncoderType, ModelUtils, Tensor3DShape
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.tf.models import ModelUtils, Tensor3DShape
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.settings import TrainerSettings, NetworkSettings, EncoderType
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers import __version__

model_path: str = "",
load: bool = False,
seed: int = 0,
) -> NNPolicy:
) -> TFPolicy:
mock_spec = mb.setup_test_behavior_specs(
use_discrete,
use_visual,

trainer_settings.network_settings.memory = (
NetworkSettings.MemorySettings() if use_rnn else None
)
policy = NNPolicy(seed, mock_spec, trainer_settings, False, model_path, load)
policy = TFPolicy(
seed, mock_spec, trainer_settings, model_path=model_path, load=load
)
return policy

assert len(cm.output) == 1
def _compare_two_policies(policy1: NNPolicy, policy2: NNPolicy) -> None:
def _compare_two_policies(policy1: TFPolicy, policy2: TFPolicy) -> None:
"""
Make sure two policies have the same output for the same input.
"""

# Change half of the obs to 0
for i in range(3):
trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
policy = NNPolicy(
policy = TFPolicy(
False,
"testdir",
False,
)

ml-agents/mlagents/trainers/tests/test_ppo.py (15)


from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

if use_rnn
else None
)
policy = NNPolicy(
0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
policy = TFPolicy(
0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
)
optimizer = PPOOptimizer(policy, trainer_settings)
return optimizer

ppo_optimizer.return_value = mock_optimizer
trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
policy_mock = mock.Mock(spec=NNPolicy)
policy_mock = mock.Mock(spec=TFPolicy)
policy_mock.get_current_step.return_value = 0
step_count = (
5 # 10 hacked because this function is no longer called through trainer

ppo_optimizer.return_value = mock_optimizer
trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy = mock.Mock(spec=TFPolicy)
policy.get_current_step.return_value = 2000
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)

# Make sure the summary steps were loaded properly
assert trainer.get_step == 2000
# Test incorrect class of policy
policy = mock.Mock()
with pytest.raises(RuntimeError):
trainer.add_policy(behavior_id, policy)
if __name__ == "__main__":

ml-agents/mlagents/trainers/tests/test_reward_signals.py (6)


import copy
import os
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.tests.test_simple_rl import PPO_CONFIG, SAC_CONFIG

if use_rnn
else None
)
policy = NNPolicy(
0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
policy = TFPolicy(
0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
)
if trainer_settings.trainer_type == TrainerType.SAC:
optimizer = SACOptimizer(policy, trainer_settings)

ml-agents/mlagents/trainers/tests/test_sac.py (13)


from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs

if use_rnn
else None
)
policy = NNPolicy(
0, mock_brain, trainer_settings, False, "test", False, create_tf_graph=False
policy = TFPolicy(
0, mock_brain, trainer_settings, "test", False, create_tf_graph=False
)
optimizer = SACOptimizer(policy, trainer_settings)
return optimizer

sac_optimizer.return_value = mock_optimizer
trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy = mock.Mock(spec=TFPolicy)
policy.get_current_step.return_value = 2000
behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
trainer.add_policy(behavior_id, policy)

assert trainer.get_step == 2000
# Test incorrect class of policy
policy = mock.Mock()
with pytest.raises(RuntimeError):
trainer.add_policy(behavior_id, policy)
def test_advance(dummy_config):

ml-agents/mlagents/trainers/tests/test_simple_rl.py (3)


GAILSettings,
TrainerType,
RewardSignalType,
EncoderType,
ScheduleType,
from mlagents.trainers.models import EncoderType, ScheduleType
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)

ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)


}
step_info = EnvironmentStep(step_info_dict, 0, action_info_dict, env_stats)
step_mock.return_value = [step_info]
env_manager.advance()
env_manager.process_steps(env_manager.get_steps())
# Test add_experiences
env_manager._step.assert_called_once()

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (3)


tc.advance(env_mock)
env_mock.reset.assert_not_called()
env_mock.advance.assert_called_once()
env_mock.get_steps.assert_called_once()
env_mock.process_steps.assert_called_once()
# May have been called many times due to thread
trainer_mock.advance.call_count > 0

ml-agents/mlagents/trainers/trainer/rl_trainer.py (4)


)
from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import timed
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.optimizer import Optimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.components.reward_signals import RewardSignalResult

for agent_id in rewards:
rewards[agent_id] = 0
def _update_end_episode_stats(self, agent_id: str, optimizer: TFOptimizer) -> None:
def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
for name, rewards in self.collected_rewards.items():
if name == "environment":
self.stats_reporter.add_stat(

9
ml-agents/mlagents/trainers/trainer/trainer.py


from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.agent_processor import AgentManagerQueue

self.step: int = 0
self.artifact_path = artifact_path
self.summary_freq = self.trainer_settings.summary_freq
self.policies: Dict[str, TFPolicy] = {}
self.policies: Dict[str, Policy] = {}
@property
def stats_reporter(self):

@abc.abstractmethod
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
) -> Policy:
"""
Creates policy
"""

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

@abc.abstractmethod
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer.
"""

36
ml-agents/mlagents/trainers/trainer_controller.py


from mlagents.tf_utils import tf
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep
from mlagents_envs.exception import (
UnityEnvironmentException,
UnityCommunicationException,

self.train_model = train
self.param_manager = param_manager
self.ghost_controller = self.trainer_factory.ghost_controller
self.registered_behavior_ids: Set[str] = set()
self.trainer_threads: List[threading.Thread] = []
self.kill_trainers = False

)
@timed
def _reset_env(self, env: EnvManager) -> None:
def _reset_env(self, env_manager: EnvManager) -> None:
"""Resets the environment.
Returns:

new_config = self.param_manager.get_current_samplers()
env.reset(config=new_config)
env_manager.reset(config=new_config)
# Register any new behavior ids that were generated on the reset.
self._register_new_behaviors(env_manager, env_manager.first_step_infos)
def _not_done_training(self) -> bool:
return (

def start_learning(self, env_manager: EnvManager) -> None:
self._create_output_path(self.output_path)
tf.reset_default_graph()
last_brain_behavior_ids: Set[str] = set()
external_brain_behavior_ids = set(env_manager.training_behaviors.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
self._create_trainers_and_managers(env_manager, new_behavior_ids)
last_brain_behavior_ids = external_brain_behavior_ids
n_steps = self.advance(env_manager)
for _ in range(n_steps):
self.reset_env_if_ready(env_manager)

env.set_env_parameters(self.param_manager.get_current_samplers())
@timed
def advance(self, env: EnvManager) -> int:
def advance(self, env_manager: EnvManager) -> int:
num_steps = env.advance()
new_step_infos = env_manager.get_steps()
self._register_new_behaviors(env_manager, new_step_infos)
num_steps = env_manager.process_steps(new_step_infos)
# Report current lesson for each environment parameter
for (

trainer.advance()
return num_steps
def _register_new_behaviors(
self, env_manager: EnvManager, step_infos: List[EnvironmentStep]
) -> None:
"""
Handle registration (adding trainers and managers) of new behavior ids.
:param env_manager:
:param step_infos:
:return:
"""
step_behavior_ids: Set[str] = set()
for s in step_infos:
step_behavior_ids |= set(s.name_behavior_ids)
new_behavior_ids = step_behavior_ids - self.registered_behavior_ids
self._create_trainers_and_managers(env_manager, new_behavior_ids)
self.registered_behavior_ids |= step_behavior_ids
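Taken together, the _reset_env and advance hunks replace the single env.advance() call with a get_steps()/process_steps() pair, and new behaviors are registered both after a reset (via env_manager.first_step_infos) and from every batch of step infos. A minimal sketch of the resulting advance flow, using only names that appear in this diff:

from typing import List

from mlagents.trainers.env_manager import EnvManager, EnvironmentStep


def advance_sketch(controller, env_manager: EnvManager) -> int:
    # Collect the EnvironmentStep batches produced since the last call...
    new_step_infos: List[EnvironmentStep] = env_manager.get_steps()
    # ...register any behavior ids seen for the first time (the reset path does
    # the same with env_manager.first_step_infos)...
    controller._register_new_behaviors(env_manager, new_step_infos)
    # ...then process the steps and report how many were consumed.
    return env_manager.process_steps(new_step_infos)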
def join_threads(self, timeout_seconds: float = 1.0) -> None:
"""

2
ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py


)
# The following code can be used as an example of calling this API from another module.
# convert() is the main entry point for the converter.
import tensorflow_to_barracuda as tf2bc
import tf.tensorflow_to_barracuda as tf2bc
tf2bc.convert(args.source_file, args.target_file, args.trim_unused_by_output, args)
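A slightly fuller sketch of that standalone usage. The argparse wiring is illustrative; only the attribute names used in the convert() call above come from this file, and the package-qualified import path (mlagents.trainers.tf.tensorflow_to_barracuda) is assumed from the rename listed at the end of this diff:

import argparse

from mlagents.trainers.tf import tensorflow_to_barracuda as tf2bc


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Convert a frozen TensorFlow graph to a Barracuda .nn file"
    )
    parser.add_argument("source_file")
    parser.add_argument("target_file")
    # Output node names used for trimming; empty means no trimming (assumed default).
    parser.add_argument("--trim-unused-by-output", dest="trim_unused_by_output", default="")
    args = parser.parse_args()
    tf2bc.convert(args.source_file, args.target_file, args.trim_unused_by_output, args)


if __name__ == "__main__":
    main()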

13
ml-agents/mlagents/trainers/tf/models.py


from enum import Enum
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.exception import UnityTrainerException

height: int
width: int
num_channels: int
class EncoderType(Enum):
SIMPLE = "simple"
NATURE_CNN = "nature_cnn"
RESNET = "resnet"
class ScheduleType(Enum):
CONSTANT = "constant"
LINEAR = "linear"
class NormalizerTensors(NamedTuple):
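The hunk above deletes the EncoderType and ScheduleType enums that used to live in models.py; they are now defined once in the settings module. A minimal sketch of the updated call-site import, grounded in the import line at the top of this hunk:

# EncoderType and ScheduleType are now defined in mlagents.trainers.settings
# instead of being duplicated inside tf/models.py.
from mlagents.trainers.settings import EncoderType, ScheduleType

encoder = EncoderType.SIMPLE    # other values: NATURE_CNN, RESNET
schedule = ScheduleType.LINEAR  # other value: CONSTANT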

2
ml-agents/mlagents/trainers/tf/distributions.py


import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
EPSILON = 1e-6 # Small value to avoid divide by zero

147
com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs


#if UNITY_2020_1_OR_NEWER
using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Extensions.Sensors
{
public class ArticulationBodyJointExtractor : IJointExtractor
{
ArticulationBody m_Body;
public ArticulationBodyJointExtractor(ArticulationBody body)
{
m_Body = body;
}
public int NumObservations(PhysicsSensorSettings settings)
{
return NumObservations(m_Body, settings);
}
public static int NumObservations(ArticulationBody body, PhysicsSensorSettings settings)
{
if (body == null || body.isRoot)
{
return 0;
}
var totalCount = 0;
if (settings.UseJointPositionsAndAngles)
{
switch (body.jointType)
{
case ArticulationJointType.RevoluteJoint:
case ArticulationJointType.SphericalJoint:
// Both RevoluteJoint and SphericalJoint have all angular components.
// We use sine and cosine of the angles for the observations.
totalCount += 2 * body.dofCount;
break;
case ArticulationJointType.FixedJoint:
// Since a FixedJoint can't move, there aren't any interesting observations for it.
break;
case ArticulationJointType.PrismaticJoint:
// One linear component
totalCount += body.dofCount;
break;
}
}
if (settings.UseJointForces)
{
totalCount += body.dofCount;
}
return totalCount;
}
public int Write(PhysicsSensorSettings settings, ObservationWriter writer, int offset)
{
if (m_Body == null || m_Body.isRoot)
{
return 0;
}
var currentOffset = offset;
// Write joint positions
if (settings.UseJointPositionsAndAngles)
{
switch (m_Body.jointType)
{
case ArticulationJointType.RevoluteJoint:
case ArticulationJointType.SphericalJoint:
// All joint positions are angular
for (var dofIndex = 0; dofIndex < m_Body.dofCount; dofIndex++)
{
var jointRotationRads = m_Body.jointPosition[dofIndex];
writer[currentOffset++] = Mathf.Sin(jointRotationRads);
writer[currentOffset++] = Mathf.Cos(jointRotationRads);
}
break;
case ArticulationJointType.FixedJoint:
// No observations
break;
case ArticulationJointType.PrismaticJoint:
writer[currentOffset++] = GetPrismaticValue();
break;
}
}
if (settings.UseJointForces)
{
for (var dofIndex = 0; dofIndex < m_Body.dofCount; dofIndex++)
{
// take tanh to keep in [-1, 1]
writer[currentOffset++] = (float) System.Math.Tanh(m_Body.jointForce[dofIndex]);
}
}
return currentOffset - offset;
}
float GetPrismaticValue()
{
// Prismatic joints should have at most one free axis.
bool limited = false;
var drive = m_Body.xDrive;
if (m_Body.linearLockX == ArticulationDofLock.LimitedMotion)
{
drive = m_Body.xDrive;
limited = true;
}
else if (m_Body.linearLockY == ArticulationDofLock.LimitedMotion)
{
drive = m_Body.yDrive;
limited = true;
}
else if (m_Body.linearLockZ == ArticulationDofLock.LimitedMotion)
{
drive = m_Body.zDrive;
limited = true;
}
var jointPos = m_Body.jointPosition[0];
if (limited)
{
// If the axis is limited, interpolate between the limits.
var upperLimit = drive.upperLimit;
var lowerLimit = drive.lowerLimit;
if (upperLimit <= lowerLimit)
{
// Invalid limits (probably equal), so don't try to lerp
return 0;
}
var invLerped = Mathf.InverseLerp(lowerLimit, upperLimit, jointPos);
// Convert [0, 1] -> [-1, 1]
var normalized = 2.0f * invLerped - 1.0f;
return normalized;
}
// take tanh() to keep in [-1, 1]
return (float) System.Math.Tanh(jointPos);
}
}
}
#endif
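GetPrismaticValue() above normalizes the joint position to [-1, 1]: a limited axis is inverse-lerped between its drive limits and rescaled from [0, 1] to [-1, 1], while an unlimited axis is squashed with tanh. A small Python sketch of that arithmetic (the names are illustrative, not part of the C# API):

import math


def normalize_prismatic(joint_pos: float, limited: bool, lower: float = 0.0, upper: float = 0.0) -> float:
    if limited:
        if upper <= lower:
            # Invalid (probably equal) limits, so don't try to lerp.
            return 0.0
        # Mathf.InverseLerp clamps to [0, 1]; then map [0, 1] -> [-1, 1].
        inv_lerped = min(max((joint_pos - lower) / (upper - lower), 0.0), 1.0)
        return 2.0 * inv_lerped - 1.0
    # Unlimited axis: take tanh to keep the value in [-1, 1].
    return math.tanh(joint_pos)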

11
com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs.meta


fileFormatVersion: 2
guid: 238d15f867b9c4ced9cef331b7420b27
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

27
com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs


using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Extensions.Sensors
{
/// <summary>
/// Interface for generating observations from a physical joint or constraint.
/// </summary>
public interface IJointExtractor
{
/// <summary>
/// Determine the number of observations that would be generated for the particular joint
/// using the provided PhysicsSensorSettings.
/// </summary>
/// <param name="settings"></param>
/// <returns>Number of floats that will be written.</returns>
int NumObservations(PhysicsSensorSettings settings);
/// <summary>
/// Write the observations to the ObservationWriter, starting at the specified offset.
/// </summary>
/// <param name="settings"></param>
/// <param name="writer"></param>
/// <param name="offset"></param>
/// <returns>Number of floats that were written.</returns>
int Write(PhysicsSensorSettings settings, ObservationWriter writer, int offset);
}
}

11
com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs.meta


fileFormatVersion: 2
guid: 2d2a01ea194334a4682d5c8cad4a956b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

62
com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs


using System.Collections.Generic;
using UnityEngine;
using Unity.MLAgents.Sensors;
namespace Unity.MLAgents.Extensions.Sensors
{
public class RigidBodyJointExtractor : IJointExtractor
{
Rigidbody m_Body;
Joint m_Joint;
public RigidBodyJointExtractor(Rigidbody body)
{
m_Body = body;
m_Joint = m_Body?.GetComponent<Joint>();
}
public int NumObservations(PhysicsSensorSettings settings)
{
return NumObservations(m_Body, m_Joint, settings);
}
public static int NumObservations(Rigidbody body, Joint joint, PhysicsSensorSettings settings)
{
if(body == null || joint == null)
{
return 0;
}
var numObservations = 0;
if (settings.UseJointForces)
{
// 3 force and 3 torque values
numObservations += 6;
}
return numObservations;
}
public int Write(PhysicsSensorSettings settings, ObservationWriter writer, int offset)
{
if (m_Body == null || m_Joint == null)
{
return 0;
}
var currentOffset = offset;
if (settings.UseJointForces)
{
// Take tanh of the forces and torques to ensure they're in [-1, 1]
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentForce.x);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentForce.y);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentForce.z);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentTorque.x);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentTorque.y);
writer[currentOffset++] = (float)System.Math.Tanh(m_Joint.currentTorque.z);
}
return currentOffset - offset;
}
}
}

11
com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs.meta


fileFormatVersion: 2
guid: 5014d7ab95c6a44469f447b8a7019746
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

0
ml-agents/mlagents/trainers/tf/__init__.py

285
ml-agents/mlagents/trainers/policy/nn_policy.py


from typing import Any, Dict, Optional, List
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)
EPSILON = 1e-6 # Small value to avoid divide by zero
class NNPolicy(TFPolicy):
def __init__(
self,
seed: int,
behavior_spec: BehaviorSpec,
trainer_params: TrainerSettings,
is_training: bool,
model_path: str,
load: bool,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
create_tf_graph: bool = True,
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param behavior_spec: Assigned BehaviorSpec object.
:param trainer_params: Defined training parameters.
:param is_training: Whether the model should be trained.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param model_path: Path where the model should be saved and loaded.
:param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy in continuous output.
"""
super().__init__(seed, behavior_spec, trainer_params, model_path, load)
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = self.network_settings.num_layers
self.h_size = self.network_settings.hidden_units
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.trainable_variables: List[tf.Variable] = []
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
if create_tf_graph:
self.create_tf_graph()
def get_trainable_variables(self) -> List[tf.Variable]:
"""
Returns a list of the trainable variables in this policy. If create_tf_graph hasn't been called,
returns an empty list.
"""
return self.trainable_variables
def create_tf_graph(self) -> None:
"""
Builds the tensorflow graph needed for this policy.
"""
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
if len(_vars) > 0:
# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
encoded = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.num_layers,
self.vis_encode_type,
)
if self.use_continuous_act:
self._create_cc_actor(
encoded,
self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
)
self.saliency = tf.reduce_mean(
tf.square(tf.gradients(self.output, self.vector_in)), axis=1
)
else:
self._create_dc_actor(encoded)
self.saliency = tf.reduce_mean(
tf.square(tf.gradients(self.output_pre, self.vector_in)), axis=1
)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
self.inference_dict: Dict[str, tf.Tensor] = {
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.memory_out
# We initialize here to make the Policy usable out of the box. If an optimizer is needed,
# it will reload the full graph.
self._initialize_graph()
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
"""
Evaluates policy for the agent experiences provided.
:param decision_requests: DecisionSteps object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
:return: Outputs from network as defined by self.inference_dict.
"""
feed_dict = {
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def _create_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("policy"):
encoded = ModelUtils.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
return encoded
def _create_cc_actor(
self,
encoded: tf.Tensor,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
with tf.variable_scope("policy"):
distribution = GaussianDistribution(
hidden_policy,
self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:
self.output_pre = distribution.sample
self.output = tf.identity(self.output_pre, name="action")
else:
self.output_pre = distribution.sample
# Clip and scale output to ensure actions are always within [-1, 1] range.
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(self.output)
self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
self.entropy = distribution.entropy
# We keep the same names for these tensors, but use new nodes to keep the code parallel with discrete control.
self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
"""
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
hidden_policy,
self.memory_in,
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
else:
hidden_policy = encoded
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
with tf.variable_scope("policy"):
distribution = MultiCategoricalDistribution(
hidden_policy, self.act_size, self.action_masks
)
# It's important that we are able to feed a value into this tensor via feed_dict to get the
# right one-hot encoding, so we can't wrap it in tf.identity.
self.output = distribution.sample
self.all_log_probs = tf.identity(distribution.log_probs, name="action")
self.selected_actions = tf.stop_gradient(
distribution.sample_onehot
) # In discrete, these are onehot
self.entropy = distribution.entropy
self.total_log_probs = distribution.total_log_probs
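The saliency tensors added in create_tf_graph() above are the batch mean of the squared gradient of the action output (self.output for continuous control, self.output_pre for discrete) with respect to the vector observations. A minimal standalone sketch of that computation, mirroring the tf.gradients/tf.square/tf.reduce_mean pattern used above:

from mlagents.tf_utils import tf


def build_saliency(action_output: tf.Tensor, vector_in: tf.Tensor) -> tf.Tensor:
    # d(action)/d(vector observation), squared so positive and negative
    # contributions don't cancel, then averaged over the batch (axis=1, because
    # tf.gradients wraps the result in a single-element list).
    return tf.reduce_mean(tf.square(tf.gradients(action_output, vector_in)), axis=1)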

/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py → /ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py

/ml-agents/mlagents/trainers/models.py → /ml-agents/mlagents/trainers/tf/models.py

/ml-agents/mlagents/trainers/distributions.py → /ml-agents/mlagents/trainers/tf/distributions.py
