
Merge branch 'master' into sensitivity

Branch: sensitivity
Andrew Cohen, 4 years ago
Commit: 06e4356c
64 files changed, with 977 insertions and 561 deletions
1. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (2)
2. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs (12)
3. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs (53)
4. com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (30)
5. com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (18)
6. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (2)
7. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs (13)
8. com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs (34)
9. com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (22)
10. com.unity.ml-agents/CHANGELOG.md (5)
11. docs/Getting-Started.md (2)
12. docs/Training-Configuration-File.md (2)
13. docs/Training-on-Microsoft-Azure.md (2)
14. docs/Using-Docker.md (4)
15. docs/Using-Tensorboard.md (2)
16. docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md (4)
17. ml-agents-envs/setup.py (2)
18. ml-agents/mlagents/model_serialization.py (2)
19. ml-agents/mlagents/trainers/agent_processor.py (5)
20. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (2)
21. ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (2)
22. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2)
23. ml-agents/mlagents/trainers/env_manager.py (29)
24. ml-agents/mlagents/trainers/ghost/trainer.py (11)
25. ml-agents/mlagents/trainers/optimizer/optimizer.py (3)
26. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2)
27. ml-agents/mlagents/trainers/policy/policy.py (166)
28. ml-agents/mlagents/trainers/policy/tf_policy.py (334)
29. ml-agents/mlagents/trainers/ppo/optimizer.py (2)
30. ml-agents/mlagents/trainers/ppo/trainer.py (21)
31. ml-agents/mlagents/trainers/sac/network.py (4)
32. ml-agents/mlagents/trainers/sac/optimizer.py (6)
33. ml-agents/mlagents/trainers/sac/trainer.py (17)
34. ml-agents/mlagents/trainers/settings.py (12)
35. ml-agents/mlagents/trainers/stats.py (17)
36. ml-agents/mlagents/trainers/tests/test_barracuda_converter.py (2)
37. ml-agents/mlagents/trainers/tests/test_bcmodule.py (7)
38. ml-agents/mlagents/trainers/tests/test_distributions.py (2)
39. ml-agents/mlagents/trainers/tests/test_models.py (2)
40. ml-agents/mlagents/trainers/tests/test_nn_policy.py (17)
41. ml-agents/mlagents/trainers/tests/test_ppo.py (15)
42. ml-agents/mlagents/trainers/tests/test_reward_signals.py (6)
43. ml-agents/mlagents/trainers/tests/test_sac.py (13)
44. ml-agents/mlagents/trainers/tests/test_simple_rl.py (3)
45. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
46. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (3)
47. ml-agents/mlagents/trainers/trainer/rl_trainer.py (4)
48. ml-agents/mlagents/trainers/trainer/trainer.py (9)
49. ml-agents/mlagents/trainers/trainer_controller.py (36)
50. ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py (2)
51. ml-agents/mlagents/trainers/tf/models.py (13)
52. ml-agents/mlagents/trainers/tf/distributions.py (2)
53. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs (147)
54. com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyJointExtractor.cs.meta (11)
55. com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs (27)
56. com.unity.ml-agents.extensions/Runtime/Sensors/IJointExtractor.cs.meta (11)
57. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs (62)
58. com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyJointExtractor.cs.meta (11)
59. ml-agents/mlagents/trainers/tf/__init__.py (0)
60. ml-agents/mlagents/trainers/policy/nn_policy.py (285)
61. /ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py (0)
62. /ml-agents/mlagents/trainers/tf/models.py (0)
63. /ml-agents/mlagents/trainers/tf/distributions.py (0)

com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodyPoseExtractor.cs (2)


var t = go.transform;
return new Pose { rotation = t.rotation, position = t.position };
}
internal ArticulationBody[] Bodies => m_Bodies;
}
}
#endif // UNITY_2020_1_OR_NEWER

com.unity.ml-agents.extensions/Runtime/Sensors/ArticulationBodySensorComponent.cs (12)


// TODO static method in PhysicsBodySensor?
// TODO only update PoseExtractor when body changes?
var poseExtractor = new ArticulationBodyPoseExtractor(RootBody);
var numTransformObservations = Settings.TransformSize(poseExtractor.NumPoses);
return new[] { numTransformObservations };
var numPoseObservations = poseExtractor.GetNumPoseObservations(Settings);
var numJointObservations = 0;
// Start from i=1 to ignore the root
for (var i = 1; i < poseExtractor.Bodies.Length; i++)
{
numJointObservations += ArticulationBodyJointExtractor.NumObservations(
poseExtractor.Bodies[i], Settings
);
}
return new[] { numPoseObservations + numJointObservations };
}
}

com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsBodySensor.cs (53)


string m_SensorName;
PoseExtractor m_PoseExtractor;
IJointExtractor[] m_JointExtractors;
PhysicsSensorSettings m_Settings;
/// <summary>

/// <param name="sensorName"></param>
public PhysicsBodySensor(Rigidbody rootBody, GameObject rootGameObject, PhysicsSensorSettings settings, string sensorName=null)
{
m_PoseExtractor = new RigidBodyPoseExtractor(rootBody, rootGameObject);
var poseExtractor = new RigidBodyPoseExtractor(rootBody, rootGameObject);
m_PoseExtractor = poseExtractor;
var numTransformObservations = settings.TransformSize(m_PoseExtractor.NumPoses);
m_Shape = new[] { numTransformObservations };
var numJointExtractorObservations = 0;
var rigidBodies = poseExtractor.Bodies;
if (rigidBodies != null)
{
m_JointExtractors = new IJointExtractor[rigidBodies.Length - 1]; // skip the root
for (var i = 1; i < rigidBodies.Length; i++)
{
var jointExtractor = new RigidBodyJointExtractor(rigidBodies[i]);
numJointExtractorObservations += jointExtractor.NumObservations(settings);
m_JointExtractors[i - 1] = jointExtractor;
}
}
else
{
m_JointExtractors = new IJointExtractor[0];
}
var numTransformObservations = m_PoseExtractor.GetNumPoseObservations(settings);
m_Shape = new[] { numTransformObservations + numJointExtractorObservations };
m_PoseExtractor = new ArticulationBodyPoseExtractor(rootBody);
var poseExtractor = new ArticulationBodyPoseExtractor(rootBody);
m_PoseExtractor = poseExtractor;
var numTransformObservations = settings.TransformSize(m_PoseExtractor.NumPoses);
m_Shape = new[] { numTransformObservations };
var numJointExtractorObservations = 0;
var articBodies = poseExtractor.Bodies;
if (articBodies != null)
{
m_JointExtractors = new IJointExtractor[articBodies.Length - 1]; // skip the root
for (var i = 1; i < articBodies.Length; i++)
{
var jointExtractor = new ArticulationBodyJointExtractor(articBodies[i]);
numJointExtractorObservations += jointExtractor.NumObservations(settings);
m_JointExtractors[i - 1] = jointExtractor;
}
}
else
{
m_JointExtractors = new IJointExtractor[0];
}
var numTransformObservations = m_PoseExtractor.GetNumPoseObservations(settings);
m_Shape = new[] { numTransformObservations + numJointExtractorObservations };
}
#endif

public int Write(ObservationWriter writer)
{
var numWritten = writer.WritePoses(m_Settings, m_PoseExtractor);
foreach (var jointExtractor in m_JointExtractors)
{
numWritten += jointExtractor.Write(m_Settings, writer, numWritten);
}
return numWritten;
}

com.unity.ml-agents.extensions/Runtime/Sensors/PhysicsSensorSettings.cs (30)


public bool UseLocalSpaceLinearVelocity;
/// <summary>
/// Whether to use joint-specific positions and angles as observations.
/// </summary>
public bool UseJointPositionsAndAngles;
/// <summary>
/// Whether to use the joint forces and torques that are applied by the solver as observations.
/// </summary>
public bool UseJointForces;
/// <summary>
/// Creates a PhysicsSensorSettings with reasonable default values.
/// </summary>
/// <returns></returns>

public bool UseLocalSpace
{
get { return UseLocalSpaceTranslations || UseLocalSpaceRotations || UseLocalSpaceLinearVelocity; }
}
/// <summary>
/// The number of floats needed to represent a given number of transforms.
/// </summary>
/// <param name="numTransforms"></param>
/// <returns></returns>
public int TransformSize(int numTransforms)
{
int obsPerTransform = 0;
obsPerTransform += UseModelSpaceTranslations ? 3 : 0;
obsPerTransform += UseModelSpaceRotations ? 4 : 0;
obsPerTransform += UseLocalSpaceTranslations ? 3 : 0;
obsPerTransform += UseLocalSpaceRotations ? 4 : 0;
obsPerTransform += UseModelSpaceLinearVelocity ? 3 : 0;
obsPerTransform += UseLocalSpaceLinearVelocity ? 3 : 0;
return numTransforms * obsPerTransform;
}
}
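As a quick worked example of the sizing logic above (and of the equivalent GetNumPoseObservations added to PoseExtractor in the next hunk): each enabled translation or velocity flag contributes 3 floats per pose and each enabled rotation flag contributes 4, so with only model-space translations and rotations enabled a single pose costs 3 + 4 = 7 floats, and an extractor with 5 poses reports 5 × 7 = 35 observations.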

com.unity.ml-agents.extensions/Runtime/Sensors/PoseExtractor.cs (18)


}
}
/// <summary>
/// Compute the number of floats needed to represent the poses for the given PhysicsSensorSettings.
/// </summary>
/// <param name="settings"></param>
/// <returns></returns>
public int GetNumPoseObservations(PhysicsSensorSettings settings)
{
int obsPerPose = 0;
obsPerPose += settings.UseModelSpaceTranslations ? 3 : 0;
obsPerPose += settings.UseModelSpaceRotations ? 4 : 0;
obsPerPose += settings.UseLocalSpaceTranslations ? 3 : 0;
obsPerPose += settings.UseLocalSpaceRotations ? 4 : 0;
obsPerPose += settings.UseModelSpaceLinearVelocity ? 3 : 0;
obsPerPose += settings.UseLocalSpaceLinearVelocity ? 3 : 0;
return NumPoses * obsPerPose;
}
internal void DrawModelSpace(Vector3 offset)
{

com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodyPoseExtractor.cs (2)


var body = m_Bodies[index];
return new Pose { rotation = body.rotation, position = body.position };
}
internal Rigidbody[] Bodies => m_Bodies;
}
}

com.unity.ml-agents.extensions/Runtime/Sensors/RigidBodySensorComponent.cs (13)


// TODO static method in PhysicsBodySensor?
// TODO only update PoseExtractor when body changes?
var poseExtractor = new RigidBodyPoseExtractor(RootBody, gameObject);
var numTransformObservations = Settings.TransformSize(poseExtractor.NumPoses);
return new[] { numTransformObservations };
var numPoseObservations = poseExtractor.GetNumPoseObservations(Settings);
var numJointObservations = 0;
// Start from i=1 to ignore the root
for (var i = 1; i < poseExtractor.Bodies.Length; i++)
{
var body = poseExtractor.Bodies[i];
var joint = body?.GetComponent<Joint>();
numJointObservations += RigidBodyJointExtractor.NumObservations(body, joint, Settings);
}
return new[] { numPoseObservations + numJointObservations };
}
}

com.unity.ml-agents.extensions/Tests/Editor/Sensors/ArticulationBodySensorTests.cs (34)


0f, 0f, 0f, 1f // LocalSpaceRotations
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
[Test]

var leafArticBody = leafGameObj.AddComponent<ArticulationBody>();
leafGameObj.transform.SetParent(middleGamObj.transform);
leafGameObj.transform.localPosition = new Vector3(4.2f, 0f, 0f);
leafArticBody.jointType = ArticulationJointType.RevoluteJoint;
leafArticBody.jointType = ArticulationJointType.PrismaticJoint;
leafArticBody.linearLockZ = ArticulationDofLock.LimitedMotion;
leafArticBody.zDrive = new ArticulationDrive
{
lowerLimit = -3,
upperLimit = 1
};
#if UNITY_2020_2_OR_NEWER
// ArticulationBody.velocity is read-only in 2020.1

#endif
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
// Update the settings to only process joint observations
sensorComponent.Settings = new PhysicsSensorSettings
{
UseJointForces = true,
UseJointPositionsAndAngles = true,
};
sensor = sensorComponent.CreateSensor();
sensor.Update();
expected = new[]
{
// revolute
0f, 1f, // joint1.position (sin and cos)
0f, // joint1.force
// prismatic
0.5f, // joint2.position (interpolate between limits)
0f, // joint2.force
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
}
}

com.unity.ml-agents.extensions/Tests/Editor/Sensors/RigidBodySensorTests.cs (22)


0f, 0f, 0f, 1f // LocalSpaceRotations
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
[Test]

0f, -1f, 1f // Leaf vel
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
// Update the settings to only process joint observations
sensorComponent.Settings = new PhysicsSensorSettings
{
UseJointPositionsAndAngles = true,
UseJointForces = true,
};
sensor = sensorComponent.CreateSensor();
sensor.Update();
expected = new[]
{
0f, 0f, 0f, // joint1.force
0f, 0f, 0f, // joint1.torque
0f, 0f, 0f, // joint2.force
0f, 0f, 0f, // joint2.torque
};
SensorTestHelper.CompareObservation(sensor, expected);
Assert.AreEqual(expected.Length, sensorComponent.GetObservationShape()[0]);
}
}

com.unity.ml-agents/CHANGELOG.md (5)


### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
The minimum supported python version for ml-agents-envs was changed to 3.6.1. (#4244)
- The minimum supported python version for ml-agents-envs was changed to 3.6.1. (#4244)
- The interaction between EnvManager and TrainerController was changed; EnvManager.advance() was split into two stages,
and TrainerController now uses the results from the first stage to handle new behavior names. This change speeds up
Python training by approximately 5-10%. (#4259)
### Minor Changes
#### com.unity.ml-agents (C#)

docs/Getting-Started.md (2)


TensorBoard. From the command line run:
```sh
tensorboard --logdir=results
tensorboard --logdir results
```
Then navigate to `localhost:6006` in your browser to view the TensorBoard

docs/Training-Configuration-File.md (2)


| `hyperparameters -> beta` | (default = `5.0e-3`) Strength of the entropy regularization, which makes the policy "more random." This ensures that agents properly explore the action space during training. Increasing this will ensure more random actions are taken. This should be adjusted such that the entropy (measurable from TensorBoard) slowly decreases alongside increases in reward. If entropy drops too quickly, increase beta. If entropy drops too slowly, decrease `beta`. <br><br>Typical range: `1e-4` - `1e-2` |
| `hyperparameters -> epsilon` | (default = `0.2`) Influences how rapidly the policy can evolve during training. Corresponds to the acceptable threshold of divergence between the old and new policies during gradient descent updating. Setting this value small will result in more stable updates, but will also slow the training process. <br><br>Typical range: `0.1` - `0.3` |
| `hyperparameters -> lambd` | (default = `0.95`) Regularization parameter (lambda) used when calculating the Generalized Advantage Estimate ([GAE](https://arxiv.org/abs/1506.02438)). This can be thought of as how much the agent relies on its current value estimate when calculating an updated value estimate. Low values correspond to relying more on the current value estimate (which can be high bias), and high values correspond to relying more on the actual rewards received in the environment (which can be high variance). The parameter provides a trade-off between the two, and the right value can lead to a more stable training process. <br><br>Typical range: `0.9` - `0.95` |
| `hyperparameters -> num_epoch` | Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
| `hyperparameters -> num_epoch` | (default = `3`) Number of passes to make through the experience buffer when performing gradient descent optimization. The larger the batch_size, the larger it is acceptable to make this. Decreasing this will ensure more stable updates, at the cost of slower learning. <br><br>Typical range: `3` - `10` |
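To make the PPO defaults quoted in the table concrete, here is a minimal sketch of the corresponding `hyperparameters` block, written as a Python dict that mirrors the YAML trainer config layout (illustrative only; the values are the defaults listed above):

```python
# Illustrative PPO hyperparameter defaults, mirroring the YAML `hyperparameters` block.
ppo_hyperparameters = {
    "beta": 5.0e-3,   # entropy regularization strength
    "epsilon": 0.2,   # acceptable divergence between old and new policies
    "lambd": 0.95,    # GAE regularization parameter (lambda)
    "num_epoch": 3,   # passes over the experience buffer per update
}
```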
### SAC-specific Configurations

docs/Training-on-Microsoft-Azure.md (2)


2. Unless you started the training as a background process, connect to your VM
from another terminal instance.
3. Run the following command from your terminal
`tensorboard --logdir=summaries --host 0.0.0.0`
`tensorboard --logdir results --host 0.0.0.0`
4. You should now be able to open a browser and navigate to
`<Your_VM_IP_Address>:6060` to view the TensorBoard report.

docs/Using-Docker.md (4)


http://localhost:6006:
```sh
docker exec -it <container-name> tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
docker exec -it <container-name> tensorboard --logdir /unity-volume/results --host 0.0.0.0
docker exec -it 3DBallContainer.first.trial tensorboard --logdir=/unity-volume/summaries --host=0.0.0.0
docker exec -it 3DBallContainer.first.trial tensorboard --logdir /unity-volume/results --host 0.0.0.0
```
For more details on Tensorboard, check out the documentation about

docs/Using-Tensorboard.md (2)


1. Open a terminal or console window:
1. Navigate to the directory where the ML-Agents Toolkit is installed.
1. From the command line run: `tensorboard --logdir=results --port=6006`
1. From the command line run: `tensorboard --logdir results --port 6006`
1. Open a browser window and navigate to
[localhost:6006](http://localhost:6006).

docs/localized/zh-CN/docs/Getting-Started-with-Balance-Ball.md (4)


### Observing Training Progress
After starting training with `learn.py` as described in the previous section, the `ml-agents` folder will
contain a `summaries` directory. To observe the training process in more detail,
contain a `results` directory. To observe the training process in more detail,
`tensorboard --logdir=summaries`
`tensorboard --logdir results`
then navigate to `localhost:6006`

ml-agents-envs/setup.py (2)


install_requires=[
"cloudpickle",
"grpcio>=1.11.0",
"numpy>=1.14.1,<2.0",
"numpy>=1.14.1,<1.19.0",
"Pillow>=4.2.1",
"protobuf>=3.6",
"pyyaml>=3.1.0",

ml-agents/mlagents/model_serialization.py (2)


from tensorflow.python.framework import graph_util
from mlagents_envs.logging_util import get_logger
from mlagents.trainers import tensorflow_to_barracuda as tf2bc
from mlagents.trainers.tf import tensorflow_to_barracuda as tf2bc
if LooseVersion(tf.__version__) < LooseVersion("1.12.0"):
# ONNX is only tested on 1.12.0 and later

ml-agents/mlagents/trainers/agent_processor.py (5)


EnvironmentStats,
)
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter

def __init__(
self,
policy: TFPolicy,
policy: Policy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,

def __init__(
self,
policy: TFPolicy,
policy: Policy,
behavior_id: str,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,

ml-agents/mlagents/trainers/components/reward_signals/__init__.py (2)


"""
Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
the reward strength, and the gamma (discount factor.)
:param policy: The Policy object (e.g. NNPolicy) that this Reward Signal will apply to.
:param policy: The Policy object (e.g. TFPolicy) that this Reward Signal will apply to.
:param settings: Settings parameters for this Reward Signal, including gamma and strength.
:return: A RewardSignal object.
"""

ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (2)


from typing import List, Tuple
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2)


from mlagents.tf_utils import tf
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
EPSILON = 1e-7

ml-agents/mlagents/trainers/env_manager.py (29)


)
from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue
from mlagents.trainers.action_info import ActionInfo
from mlagents_envs.logging_util import get_logger

class EnvManager(ABC):
def __init__(self):
self.policies: Dict[BehaviorName, TFPolicy] = {}
self.policies: Dict[BehaviorName, Policy] = {}
self.first_step_infos: List[EnvironmentStep] = None
self.first_step_infos: List[EnvironmentStep] = []
def set_policy(self, brain_name: BehaviorName, policy: TFPolicy) -> None:
def set_policy(self, brain_name: BehaviorName, policy: Policy) -> None:
self.policies[brain_name] = policy
if brain_name in self.agent_managers:
self.agent_managers[brain_name].policy = policy

def close(self):
pass
def advance(self):
def get_steps(self) -> List[EnvironmentStep]:
"""
Updates the policies, steps the environments, and returns the step information from the environments.
Calling code should pass the returned EnvironmentSteps to process_steps() after calling this.
:return: The list of EnvironmentSteps
"""
if self.first_step_infos is not None:
if self.first_step_infos:
self.first_step_infos = None
self.first_step_infos = []
for brain_name in self.training_behaviors:
for brain_name in self.agent_managers.keys():
_policy = None
try:
# We make sure to empty the policy queue before continuing to produce steps.

except AgentManagerQueue.Empty:
if _policy is not None:
self.set_policy(brain_name, _policy)
# Step the environment
# policy_queue contains Policy, but we need a TFPolicy here
self.set_policy(brain_name, _policy) # type: ignore
# Step the environments
return new_step_infos
def process_steps(self, new_step_infos: List[EnvironmentStep]) -> int:
# Add to AgentProcessor
num_step_infos = self._process_step_infos(new_step_infos)
return num_step_infos
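The docstring above describes the new two-stage contract. Below is a minimal sketch of how calling code might drive it; the driver function and the `on_new_steps` callback are hypothetical, and only the `get_steps()`/`process_steps()` pair shown in this hunk is assumed:

```python
# Hypothetical driver for the two-stage EnvManager API shown above.
# `env_manager` is assumed to be an EnvManager subclass; `on_new_steps` is
# whatever the caller wants to run between the two stages (e.g. registering
# newly seen behavior names before the steps reach the AgentProcessors).
def advance_once(env_manager, on_new_steps):
    # Stage 1: update policies from their queues, step the environments,
    # and collect the resulting EnvironmentStep objects.
    new_step_infos = env_manager.get_steps()
    on_new_steps(new_step_infos)
    # Stage 2: feed the collected steps to the AgentProcessors and report
    # how many were handled.
    return env_manager.process_steps(new_step_infos)
```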

ml-agents/mlagents/trainers/ghost/trainer.py (11)


# ## ML-Agent Learning (Ghost Trainer)
from collections import defaultdict
from typing import Deque, Dict, DefaultDict, List, cast
from typing import Deque, Dict, DefaultDict, List
import numpy as np

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trajectory import Trajectory

for brain_name in self._internal_policy_queues:
internal_policy_queue = self._internal_policy_queues[brain_name]
try:
policy = cast(TFPolicy, internal_policy_queue.get_nowait())
policy = internal_policy_queue.get_nowait()
self.current_policy_snapshot[brain_name] = policy.get_weights()
except AgentManagerQueue.Empty:
pass

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
) -> Policy:
"""
Creates policy with the wrapped trainer's create_policy function
The first policy encountered sets the wrapped

return policy
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to GhostTrainer.

self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
self.policies[name_behavior_id] = policy
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy associated with name_behavior_id
:param name_behavior_id: Fully qualified behavior name

ml-agents/mlagents/trainers/optimizer/optimizer.py (3)


Provides methods to update the Policy.
"""
def __init__(self):
self.reward_signals = {}
@abc.abstractmethod
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2)


class TFOptimizer(Optimizer): # pylint: disable=W0223
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
super().__init__()
self.sess = policy.sess
self.policy = policy
self.update_dict: Dict[str, tf.Tensor] = {}

Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
# Create reward signals
for reward_signal, settings in reward_signal_configs.items():
# Name reward signals by string in case we have duplicates later

ml-agents/mlagents/trainers/policy/policy.py (166)


from abc import ABC, abstractmethod
from abc import abstractmethod
from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.exception import UnityException
from mlagents.model_serialization import SerializationSettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
class Policy(ABC):
@abstractmethod
class UnityPolicyException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class Policy:
def __init__(
self,
seed: int,
behavior_spec: BehaviorSpec,
trainer_settings: TrainerSettings,
model_path: str,
load: bool = False,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
):
self.behavior_spec = behavior_spec
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()
else [behavior_spec.action_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.model_path = model_path
self.initialize_path = self.trainer_settings.init_path
self._keep_checkpoints = self.trainer_settings.keep_checkpoints
self.use_continuous_act = behavior_spec.is_action_continuous()
self.num_branches = self.behavior_spec.action_size
self.previous_action_dict: Dict[str, np.array] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None
self.load = load
self.h_size = self.network_settings.hidden_units
num_layers = self.network_settings.num_layers
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.m_size = 0
self.sequence_length = 1
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
raise NotImplementedError
@abstractmethod
def update_normalization(self, vector_obs: np.ndarray) -> None:
pass
@abstractmethod
def increment_step(self, n_steps):
pass
@abstractmethod
def get_current_step(self):
pass
@abstractmethod
def checkpoint(self, checkpoint_path: str, settings: SerializationSettings) -> None:
pass
@abstractmethod
def save(self, output_filepath: str, settings: SerializationSettings) -> None:
pass
@abstractmethod
def load_weights(self, values: List[np.ndarray]) -> None:
pass
@abstractmethod
def get_weights(self) -> List[np.ndarray]:
return []
@abstractmethod
def init_load_weights(self) -> None:
pass
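The memory and previous-action helpers moved into this base class are plain dictionaries keyed by agent id. A small illustrative sketch of the call pattern follows; `roll_memories` is not a real ML-Agents function, and `policy` is assumed to be an already-constructed Policy subclass:

```python
import numpy as np
from typing import List

def roll_memories(policy, agent_ids: List[str]) -> np.ndarray:
    """Illustrative only: store zeroed memories for the given agents, then
    read them back using the Policy helpers defined above."""
    # Shape (len(agent_ids), policy.m_size), filled with zeros.
    memories = policy.make_empty_memory(len(agent_ids))
    policy.save_memories(agent_ids, memories)  # one row stored per agent id
    policy.remove_memories(agent_ids[:1])      # drop the first agent's entry
    # Rows come back in the same order; removed/unknown agents yield zero rows.
    return policy.retrieve_memories(agent_ids)
```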

ml-agents/mlagents/trainers/policy/tf_policy.py (334)


from typing import Any, Dict, List, Optional, Tuple
import abc
from mlagents_envs.timers import timed
from mlagents.model_serialization import SerializationSettings, export_policy_model
from mlagents.tf_utils import tf

from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers.tf.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)
logger = get_logger(__name__)

# determines compatibility with inference in Barracuda.
MODEL_FORMAT_VERSION = 2
EPSILON = 1e-6 # Small value to avoid divide by zero
class UnityPolicyException(UnityException):
"""

trainer_settings: TrainerSettings,
model_path: str,
load: bool = False,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
create_tf_graph: bool = True,
):
"""
Initialized the policy.

:param model_path: Where to load/save the model.
:param load: If True, load model from model_path. Otherwise, create new model.
"""
self.m_size = 0
self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
super().__init__(
seed,
behavior_spec,
trainer_settings,
model_path,
load,
tanh_squash,
reparameterize,
condition_sigma_on_obs,
)
self.inference_dict: Dict[str, tf.Tensor] = {}
self.sequence_length = 1
self.seed = seed
self.behavior_spec = behavior_spec
self.act_size = (
list(behavior_spec.discrete_action_branches)
if behavior_spec.is_action_discrete()
else [behavior_spec.action_size]
)
self.vec_obs_size = sum(
shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
)
self.vis_obs_size = sum(
1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.inference_dict: Dict[str, tf.Tensor] = {}
self.use_recurrent = self.network_settings.memory is not None
self.memory_dict: Dict[str, np.ndarray] = {}
self.num_branches = self.behavior_spec.action_size
self.previous_action_dict: Dict[str, np.array] = {}
self.normalize = self.network_settings.normalize
self.use_continuous_act = behavior_spec.is_action_continuous()
self.model_path = model_path
self.initialize_path = self.trainer_settings.init_path
self.keep_checkpoints = self.trainer_settings.keep_checkpoints
self.seed = seed
if self.network_settings.memory is not None:
self.m_size = self.network_settings.memory.memory_size
self.sequence_length = self.network_settings.memory.sequence_length
self.load = load
self.grads = None
self.update_batch: Optional[tf.Operation] = None
self.trainable_variables: List[tf.Variable] = []
if create_tf_graph:
self.create_tf_graph()
@abc.abstractmethod
pass
return self.trainable_variables
@abc.abstractmethod
def create_tf_graph(self):
def create_tf_graph(self) -> None:
pass
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
if len(_vars) > 0:
# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
encoded = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.num_layers,
self.vis_encode_type,
)
if self.use_continuous_act:
self._create_cc_actor(
encoded,
self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
)
else:
self._create_dc_actor(encoded)
self.saliency = tf.reduce_mean(
tf.square(tf.gradients(self.output, self.vector_in)), axis=1
)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
self.inference_dict = {
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.memory_out
# We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
# it will re-load the full graph
self._initialize_graph()
def _create_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("policy"):
encoded = ModelUtils.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
return encoded
@staticmethod
def _convert_version_string(version_string: str) -> Tuple[int, ...]:

def _initialize_graph(self):
with self.graph.as_default():
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
self.saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
logger.info(f"Loading model from {model_path}.")
ckpt = tf.train.get_checkpoint_state(model_path)
if ckpt is None:

feed_dict[assign_ph] = value
self.sess.run(self.assign_ops, feed_dict=feed_dict)
@timed
:param decision_requests: DecisionSteps input to network.
:return: Output from policy based on self.inference_dict.
:param decision_requests: DecisionSteps object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
:return: Outputs from network as defined by self.inference_dict.
raise UnityPolicyException("The evaluate function was not implemented.")
feed_dict = {
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0

mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
feed_dict[self.action_masks] = mask
return feed_dict
def make_empty_memory(self, num_agents):
"""
Creates empty memory for use with RNNs
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
return memory_matrix
def remove_memories(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
"""
Creates empty previous action for use with RNNs and discrete control
:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
return np.zeros((num_agents, self.num_branches), dtype=np.int)
def save_previous_action(
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
) -> None:
if action_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.previous_action_dict[agent_id] = action_matrix[index, :]
def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]
return action_matrix
def remove_previous_action(self, agent_ids):
for agent_id in agent_ids:
if agent_id in self.previous_action_dict:
self.previous_action_dict.pop(agent_id)
def get_current_step(self):
"""

trainable=False,
dtype=tf.int32,
)
def _create_cc_actor(
self,
encoded: tf.Tensor,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
with tf.variable_scope("policy"):
distribution = GaussianDistribution(
hidden_policy,
self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:
self.output_pre = distribution.sample
self.output = tf.identity(self.output_pre, name="action")
else:
self.output_pre = distribution.sample
# Clip and scale output to ensure actions are always within [-1, 1] range.
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(self.output)
self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
self.entropy = distribution.entropy
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
"""
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
hidden_policy,
self.memory_in,
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
else:
hidden_policy = encoded
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
with tf.variable_scope("policy"):
distribution = MultiCategoricalDistribution(
hidden_policy, self.act_size, self.action_masks
)
# It's important that we are able to feed_dict a value into this tensor to get the
# right one-hot encoding, so we can't do identity on it.
self.output = distribution.sample
self.all_log_probs = tf.identity(distribution.log_probs, name="action")
self.selected_actions = tf.stop_gradient(
distribution.sample_onehot
) # In discrete, these are onehot
self.entropy = distribution.entropy
self.total_log_probs = distribution.total_log_probs

ml-agents/mlagents/trainers/ppo/optimizer.py (2)


import numpy as np
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.tf.models import ModelUtils, EncoderType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer

ml-agents/mlagents/trainers/ppo/trainer.py (21)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory

)
self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: Policy = None # type: ignore
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""

:param behavior_spec: specifications for policy construction
:return policy
"""
policy = NNPolicy(
policy = TFPolicy(
self.is_training,
self.artifact_path,
self.load,
model_path=self.artifact_path,
load=self.load,
condition_sigma_on_obs=False, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
)

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
self.optimizer = PPOOptimizer(
cast(TFPolicy, self.policy), self.trainer_settings
)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy

ml-agents/mlagents/trainers/sac/network.py (4)


from typing import Dict, Optional
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import EncoderType
LOG_STD_MAX = 2
LOG_STD_MIN = -20

self._create_memory_ins(m_size)
hidden_critic = self._create_observation_in(vis_encode_type)
self.policy.output = self.policy.output
# Use the sequence length of the policy
self.sequence_length_ph = self.policy.sequence_length_ph

ml-agents/mlagents/trainers/sac/optimizer.py (6)


from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer

# Non-exposed SAC parameters
self.discrete_target_entropy_scale = (
0.2
) # Roughly equal to e-greedy 0.05
0.2 # Roughly equal to e-greedy 0.05
)
self.continuous_target_entropy_scale = 1.0
stream_names = list(self.reward_signals.keys())

ml-agents/mlagents/trainers/sac/trainer.py (17)


from mlagents_envs.timers import timed
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy import Policy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations

self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: Policy = None # type: ignore
self.optimizer: SACOptimizer = None # type: ignore
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
policy = NNPolicy(
policy = TFPolicy(
self.is_training,
self.artifact_path,
self.load,
tanh_squash=True,

self._stats_reporter.add_stat(stat, np.mean(stat_list))
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
self.optimizer = SACOptimizer(
cast(TFPolicy, self.policy), self.trainer_settings
)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

max(1, self.step / self.reward_signal_steps_per_update)
)