
Develop hybrid action staging (#4702)

Co-authored-by: Ervin T <ervin@unity3d.com>
Co-authored-by: Vincent-Pierre BERGES <vincentpierre@unity3d.com>
Co-authored-by: Ruo-Ping Dong <ruoping.dong@unity3d.com>
Co-authored-by: Chris Elion <chris.elion@unity3d.com>
/MLA-1734-demo-provider
GitHub, 4 years ago
Current commit
990f801a
118 files changed, with 2,358 additions and 1,618 deletions. Changed files (changed line counts in parentheses):
  1. Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (1)
  2. com.unity.ml-agents/Runtime/Academy.cs (7)
  3. com.unity.ml-agents/Runtime/Actuators/ActionSegment.cs (8)
  4. com.unity.ml-agents/Runtime/Actuators/ActionSpec.cs (12)
  5. com.unity.ml-agents/Runtime/Actuators/ActuatorManager.cs (4)
  6. com.unity.ml-agents/Runtime/Actuators/IActionReceiver.cs (96)
  7. com.unity.ml-agents/Runtime/Agent.cs (34)
  8. com.unity.ml-agents/Runtime/Agent.deprecated.cs (10)
  9. com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs (77)
  10. com.unity.ml-agents/Runtime/Communicator/ICommunicator.cs (2)
  11. com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs (13)
  12. com.unity.ml-agents/Runtime/Communicator/UnityRLCapabilities.cs (13)
  13. com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/AgentAction.cs (82)
  14. com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/BrainParameters.cs (348)
  15. com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/Capabilities.cs (44)
  16. com.unity.ml-agents/Runtime/Inference/ApplierImpl.cs (44)
  17. com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs (237)
  18. com.unity.ml-agents/Runtime/Inference/GeneratorImpl.cs (4)
  19. com.unity.ml-agents/Runtime/Inference/ModelRunner.cs (12)
  20. com.unity.ml-agents/Runtime/Inference/TensorApplier.cs (35)
  21. com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs (26)
  22. com.unity.ml-agents/Runtime/Inference/TensorNames.cs (15)
  23. com.unity.ml-agents/Runtime/Policies/BarracudaPolicy.cs (19)
  24. com.unity.ml-agents/Runtime/Policies/RemotePolicy.cs (14)
  25. com.unity.ml-agents/Tests/Editor/Actuators/ActuatorManagerTests.cs (12)
  26. com.unity.ml-agents/Tests/Editor/DemonstrationTests.cs (3)
  27. com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorApplier.cs (74)
  28. com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorGenerator.cs (7)
  29. com.unity.ml-agents/Tests/Editor/ModelRunnerTest.cs (62)
  30. com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs (212)
  31. com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action_deprecated.nn.meta (2)
  32. com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr_deprecated.nn.meta (2)
  33. docs/Getting-Started.md (8)
  34. docs/Learning-Environment-Create-New.md (15)
  35. docs/Learning-Environment-Design-Agents.md (80)
  36. docs/Python-API.md (64)
  37. docs/Training-Configuration-File.md (4)
  38. gym-unity/gym_unity/envs/__init__.py (10)
  39. ml-agents-envs/mlagents_envs/base_env.py (148)
  40. ml-agents-envs/mlagents_envs/communicator_objects/agent_action_pb2.py (22)
  41. ml-agents-envs/mlagents_envs/communicator_objects/agent_action_pb2.pyi (12)
  42. ml-agents-envs/mlagents_envs/communicator_objects/brain_parameters_pb2.py (82)
  43. ml-agents-envs/mlagents_envs/communicator_objects/brain_parameters_pb2.pyi (45)
  44. ml-agents-envs/mlagents_envs/communicator_objects/capabilities_pb2.py (13)
  45. ml-agents-envs/mlagents_envs/communicator_objects/capabilities_pb2.pyi (6)
  46. ml-agents-envs/mlagents_envs/environment.py (30)
  47. ml-agents-envs/mlagents_envs/mock_communicator.py (18)
  48. ml-agents-envs/mlagents_envs/rpc_utils.py (23)
  49. ml-agents-envs/mlagents_envs/tests/test_envs.py (6)
  50. ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py (33)
  51. ml-agents-envs/mlagents_envs/tests/test_steps.py (27)
  52. ml-agents/mlagents/trainers/action_info.py (3)
  53. ml-agents/mlagents/trainers/agent_processor.py (23)
  54. ml-agents/mlagents/trainers/buffer.py (2)
  55. ml-agents/mlagents/trainers/demo_loader.py (18)
  56. ml-agents/mlagents/trainers/env_manager.py (1)
  57. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (4)
  58. ml-agents/mlagents/trainers/policy/policy.py (40)
  59. ml-agents/mlagents/trainers/policy/tf_policy.py (33)
  60. ml-agents/mlagents/trainers/policy/torch_policy.py (77)
  61. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (9)
  62. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (13)
  63. ml-agents/mlagents/trainers/sac/optimizer_tf.py (6)
  64. ml-agents/mlagents/trainers/sac/optimizer_torch.py (284)
  65. ml-agents/mlagents/trainers/simple_env_manager.py (2)
  66. ml-agents/mlagents/trainers/subprocess_env_manager.py (4)
  67. ml-agents/mlagents/trainers/tests/mock_brain.py (24)
  68. ml-agents/mlagents/trainers/tests/simple_test_envs.py (82)
  69. ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66)
  70. ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py (128)
  71. ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py (12)
  72. ml-agents/mlagents/trainers/tests/test_agent_processor.py (41)
  73. ml-agents/mlagents/trainers/tests/test_demo_loader.py (10)
  74. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (2)
  75. ml-agents/mlagents/trainers/tests/test_trajectory.py (7)
  76. ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py (13)
  77. ml-agents/mlagents/trainers/tests/torch/test_distributions.py (2)
  78. ml-agents/mlagents/trainers/tests/torch/test_networks.py (90)
  79. ml-agents/mlagents/trainers/tests/torch/test_policy.py (28)
  80. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (15)
  81. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (2)
  82. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (11)
  83. ml-agents/mlagents/trainers/tests/torch/test_sac.py (3)
  84. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (132)
  85. ml-agents/mlagents/trainers/tests/torch/test_utils.py (44)
  86. ml-agents/mlagents/trainers/tf/components/bc/module.py (6)
  87. ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py (10)
  88. ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py (17)
  89. ml-agents/mlagents/trainers/torch/components/bc/module.py (52)
  90. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (78)
  91. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (8)
  92. ml-agents/mlagents/trainers/torch/distributions.py (21)
  93. ml-agents/mlagents/trainers/torch/model_serialization.py (31)
  94. ml-agents/mlagents/trainers/torch/networks.py (226)
  95. ml-agents/mlagents/trainers/torch/utils.py (48)
  96. ml-agents/mlagents/trainers/trajectory.py (28)
  97. ml-agents/tests/yamato/scripts/run_llapi.py (22)
  98. protobuf-definitions/proto/mlagents_envs/communicator_objects/agent_action.proto (4)
  99. protobuf-definitions/proto/mlagents_envs/communicator_objects/brain_parameters.proto (14)
  100. protobuf-definitions/proto/mlagents_envs/communicator_objects/capabilities.proto (3)

1
Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs


/// The agent's four actions correspond to torques on each of the two joints.
/// </summary>
public override void OnActionReceived(ActionBuffers actionBuffers)
{
m_GoalDegree += m_GoalSpeed;
UpdateGoalPosition();

7
com.unity.ml-agents/Runtime/Academy.cs


/// <term>1.2.0</term>
/// <description>Support compression mapping for stacked compressed observations.</description>
/// </item>
/// <item>
/// <term>1.3.0</term>
/// <description>Support action spaces with both continuous and discrete actions.</description>
/// </item>
const string k_ApiVersion = "1.2.0";
const string k_ApiVersion = "1.3.0";
/// <summary>
/// Unity package version of com.unity.ml-agents.

Dispose();
}
}
#endif
/// <summary>

8
com.unity.ml-agents/Runtime/Actuators/ActionSegment.cs


System.Array.Clear(Array, Offset, Length);
}
/// <summary>
/// Check if the segment is empty.
/// </summary>
public bool IsEmpty()
{
return Array == null || Array.Length == 0;
}
/// <inheritdoc/>
IEnumerator<T> IEnumerable<T>.GetEnumerator()
{
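A minimal usage sketch (not part of this diff) for ActionSegment<T> and the new IsEmpty() helper; the array values and the helper class name are illustrative only, and the sketch assumes the public three-argument constructor and Clear() shown in the hunk above:
using Unity.MLAgents.Actuators;

public static class ActionSegmentSketch
{
    public static void Demo()
    {
        var backing = new float[] { 0.1f, 0.2f, 0.3f, 0.4f };
        // View the last two elements of the backing array without copying it.
        var segment = new ActionSegment<float>(backing, 2, 2);
        bool empty = segment.IsEmpty();   // false: the backing array is non-null and non-empty
        float last = segment[1];          // reads backing[3]
        segment.Clear();                  // zeroes backing[2] and backing[3]
    }
}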

12
com.unity.ml-agents/Runtime/Actuators/ActionSpec.cs


/// </summary>
public readonly struct ActionSpec
{
/// <summary>
/// An array of branch sizes for our action space.
///

}
/// <summary>
/// Temporary check that the ActionSpec uses either all continuous or all discrete actions.
/// This should be removed once the trainer supports them.
/// Check that the ActionSpec uses either all continuous or all discrete actions.
/// This is only used when connecting to old versions of the trainer that don't support this.
internal void CheckNotHybrid()
internal void CheckAllContinuousOrDiscrete()
throw new UnityAgentsException("ActionSpecs must be all continuous or all discrete.");
throw new UnityAgentsException(
"Action spaces with both continuous and discrete actions are not supported by the trainer. " +
"ActionSpecs must be all continuous or all discrete."
);
}
}
}
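As a clarifying sketch (not part of this change), the hybrid case that CheckAllContinuousOrDiscrete() rejects for older trainers is simply a spec that declares both kinds of actions. Only NumContinuousActions and NumDiscreteActions from the API above are used here; the helper class and method names are hypothetical:
using Unity.MLAgents.Actuators;

public static class ActionSpecSketch
{
    // True when the spec mixes action types, i.e. the case that trainers
    // without the HybridActions capability cannot handle.
    public static bool IsHybrid(ActionSpec spec)
    {
        return spec.NumContinuousActions > 0 && spec.NumDiscreteActions > 0;
    }
}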

4
com.unity.ml-agents/Runtime/Actuators/ActuatorManager.cs


Debug.Assert(
!m_Actuators[i].Name.Equals(m_Actuators[i + 1].Name),
"Actuator names must be unique.");
var first = m_Actuators[i].ActionSpec;
var second = m_Actuators[i + 1].ActionSpec;
Debug.Assert(first.NumContinuousActions > 0 == second.NumContinuousActions > 0,
"Actuators on the same Agent must have the same action SpaceType.");
}
}

96
com.unity.ml-agents/Runtime/Actuators/IActionReceiver.cs


}
/// <summary>
/// Construct an <see cref="ActionBuffers"/> instance with <see cref="ActionSpec"/>. All values are initialized to zeros.
/// </summary>
/// <param name="actionSpec">The <see cref="ActionSpec"/> to send to an <see cref="IActionReceiver"/>.</param>
public ActionBuffers(ActionSpec actionSpec)
: this(new ActionSegment<float>(new float[actionSpec.NumContinuousActions]),
new ActionSegment<int>(new int[actionSpec.NumDiscreteActions]))
{ }
/// <summary>
/// Create an <see cref="ActionBuffers"/> instance with ActionSpec and all actions stored as a float array.
/// </summary>
/// <param name="actionSpec"><see cref="ActionSpec"/> of the <see cref="ActionBuffers"/></param>
/// <param name="actions">The float array of all actions, including discrete and continuous actions.</param>
/// <returns>An <see cref="ActionBuffers"/> instance initialized with a <see cref="ActionSpec"/> and a float array.
internal static ActionBuffers FromActionSpec(ActionSpec actionSpec, float[] actions)
{
if (actions == null)
{
return ActionBuffers.Empty;
}
Debug.Assert(actions.Length == actionSpec.NumContinuousActions + actionSpec.NumDiscreteActions,
$"The length of '{nameof(actions)}' does not match the total size of ActionSpec.\n" +
$"{nameof(actions)}.Length: {actions.Length}\n" +
$"{nameof(actionSpec)}: {actionSpec.NumContinuousActions + actionSpec.NumDiscreteActions}");
ActionSegment<float> continuousActionSegment = ActionSegment<float>.Empty;
ActionSegment<int> discreteActionSegment = ActionSegment<int>.Empty;
int offset = 0;
if (actionSpec.NumContinuousActions > 0)
{
continuousActionSegment = new ActionSegment<float>(actions, 0, actionSpec.NumContinuousActions);
offset += actionSpec.NumContinuousActions;
}
if (actionSpec.NumDiscreteActions > 0)
{
int[] discreteActions = new int[actionSpec.NumDiscreteActions];
for (var i = 0; i < actionSpec.NumDiscreteActions; i++)
{
discreteActions[i] = (int)actions[i + offset];
}
discreteActionSegment = new ActionSegment<int>(discreteActions);
}
return new ActionBuffers(continuousActionSegment, discreteActionSegment);
}
/// <summary>
/// Clear the <see cref="ContinuousActions"/> and <see cref="DiscreteActions"/> segments to be all zeros.
/// </summary>
public void Clear()

}
/// <summary>
/// Check if the <see cref="ActionBuffers"/> is empty.
/// </summary>
public bool IsEmpty()
{
return ContinuousActions.IsEmpty() && DiscreteActions.IsEmpty();
}
/// <inheritdoc/>
public override bool Equals(object obj)
{

unchecked
{
return (ContinuousActions.GetHashCode() * 397) ^ DiscreteActions.GetHashCode();
}
}
/// <summary>
/// Packs the continuous and discrete actions into one float array. The array passed into this method
/// must have a Length that is greater than or equal to the sum of the Lengths of
/// <see cref="ContinuousActions"/> and <see cref="DiscreteActions"/>.
/// </summary>
/// <param name="destination">A float array to pack actions into whose length is greater than or
/// equal to the addition of the Lengths of this objects <see cref="ContinuousActions"/> and
/// <see cref="DiscreteActions"/> segments.</param>
public void PackActions(in float[] destination)
{
Debug.Assert(destination.Length >= ContinuousActions.Length + DiscreteActions.Length,
$"argument '{nameof(destination)}' is not large enough to pack the actions into.\n" +
$"{nameof(destination)}.Length: {destination.Length}\n" +
$"{nameof(ContinuousActions)}.Length + {nameof(DiscreteActions)}.Length: {ContinuousActions.Length + DiscreteActions.Length}");
var start = 0;
if (ContinuousActions.Length > 0)
{
Array.Copy(ContinuousActions.Array,
ContinuousActions.Offset,
destination,
start,
ContinuousActions.Length);
start = ContinuousActions.Length;
}
if (start >= destination.Length)
{
return;
}
if (DiscreteActions.Length > 0)
{
Array.Copy(DiscreteActions.Array,
DiscreteActions.Offset,
destination,
start,
DiscreteActions.Length);
}
}
}
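A short sketch (illustration only, not part of the diff) of how the new ActionBuffers(ActionSpec) constructor and PackActions fit together, assuming a spec with two continuous actions and one discrete branch; the values and the helper name are made up:
using Unity.MLAgents.Actuators;

public static class ActionBuffersSketch
{
    public static float[] FlattenExample(ActionSpec spec)
    {
        var buffers = new ActionBuffers(spec);        // all values start at zero
        var continuous = buffers.ContinuousActions;   // ActionSegment<float> over the buffer
        continuous[0] = 0.5f;
        continuous[1] = -0.25f;
        var discrete = buffers.DiscreteActions;       // ActionSegment<int> over the buffer
        discrete[0] = 2;

        // Legacy consumers still expect one flat float array: continuous values first,
        // then the discrete choices cast to float.
        var flat = new float[spec.NumContinuousActions + spec.NumDiscreteActions];
        buffers.PackActions(flat);                    // flat == { 0.5f, -0.25f, 2f }
        return flat;
    }
}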

34
com.unity.ml-agents/Runtime/Agent.cs


/// <summary>
/// Keeps track of the last vector action taken by the Brain.
/// </summary>
public float[] storedVectorActions;
public ActionBuffers storedVectorActions;
/// <summary>
/// For discrete control, specifies the actions that the agent cannot take.

public void ClearActions()
{
Array.Clear(storedVectorActions, 0, storedVectorActions.Length);
storedVectorActions.Clear();
actionBuffers.PackActions(storedVectorActions);
var continuousActions = storedVectorActions.ContinuousActions;
for (var i = 0; i < actionBuffers.ContinuousActions.Length; i++)
{
continuousActions[i] = actionBuffers.ContinuousActions[i];
}
var discreteActions = storedVectorActions.DiscreteActions;
for (var i = 0; i < actionBuffers.DiscreteActions.Length; i++)
{
discreteActions[i] = actionBuffers.DiscreteActions[i];
}
}
}

InitializeSensors();
}
m_Info.storedVectorActions = new float[m_ActuatorManager.TotalNumberOfActions];
m_Info.storedVectorActions = new ActionBuffers(
new float[m_ActuatorManager.NumContinuousActions],
new int[m_ActuatorManager.NumDiscreteActions]
);
// The first time the Academy resets, all Agents in the scene will be
// forced to reset through the <see cref="AgentForceReset"/> event.

m_CumulativeReward = 0f;
m_RequestAction = false;
m_RequestDecision = false;
Array.Clear(m_Info.storedVectorActions, 0, m_Info.storedVectorActions.Length);
m_Info.storedVectorActions.Clear();
}
/// <summary>

}
else
{
m_ActuatorManager.StoredActions.PackActions(m_Info.storedVectorActions);
m_Info.CopyActions(m_ActuatorManager.StoredActions);
}
UpdateSensors();

/// </param>
public virtual void OnActionReceived(ActionBuffers actions)
{
actions.PackActions(m_LegacyActionCache);
if (!actions.ContinuousActions.IsEmpty())
{
m_LegacyActionCache = actions.ContinuousActions.Array;
}
else
{
m_LegacyActionCache = Array.ConvertAll(actions.DiscreteActions.Array, x => (float)x);
}
OnActionReceived(m_LegacyActionCache);
}

{
OnEpisodeBegin();
}
}
/// <summary>
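For context (not part of the diff), a minimal Agent that consumes a hybrid action space through the ActionBuffers overload looks like the sketch below; the class name and the meaning assigned to each branch are hypothetical:
using Unity.MLAgents;
using Unity.MLAgents.Actuators;

public class HybridAgentSketch : Agent
{
    public override void OnActionReceived(ActionBuffers actionBuffers)
    {
        // Continuous branch, e.g. a torque in [-1, 1].
        var torque = actionBuffers.ContinuousActions[0];
        // Discrete branch, e.g. 0 = do nothing, 1 = jump.
        var jump = actionBuffers.DiscreteActions[0];
        // ...apply torque and jump to the simulation here...
    }
}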

10
com.unity.ml-agents/Runtime/Agent.deprecated.cs


// [Obsolete("GetAction has been deprecated, please use GetStoredActionBuffers, Or GetStoredDiscreteActions.")]
public float[] GetAction()
{
return m_Info.storedVectorActions;
var storedAction = m_Info.storedVectorActions;
if (!storedAction.ContinuousActions.IsEmpty())
{
return storedAction.ContinuousActions.Array;
}
else
{
return Array.ConvertAll(storedAction.DiscreteActions.Array, x => (float)x);
}
}
}
}

77
com.unity.ml-agents/Runtime/Communicator/GrpcExtensions.cs


var agentInfoProto = ai.ToAgentInfoProto();
var agentActionProto = new AgentActionProto();
if (ai.storedVectorActions != null)
if (!ai.storedVectorActions.IsEmpty())
agentActionProto.VectorActions.AddRange(ai.storedVectorActions);
if (!ai.storedVectorActions.ContinuousActions.IsEmpty())
{
agentActionProto.ContinuousActions.AddRange(ai.storedVectorActions.ContinuousActions.Array);
}
if (!ai.storedVectorActions.DiscreteActions.IsEmpty())
{
agentActionProto.DiscreteActions.AddRange(ai.storedVectorActions.DiscreteActions.Array);
}
}
return new AgentInfoActionPairProto

return summariesOut;
}
#endregion
#region BrainParameters

{
var brainParametersProto = new BrainParametersProto
{
VectorActionSize = { bp.VectorActionSize },
VectorActionSpaceType = (SpaceTypeProto)bp.VectorActionSpaceType,
VectorActionSizeDeprecated = { bp.VectorActionSize },
VectorActionSpaceTypeDeprecated = (SpaceTypeProto)bp.VectorActionSpaceType,
brainParametersProto.VectorActionDescriptions.AddRange(bp.VectorActionDescriptions);
brainParametersProto.VectorActionDescriptionsDeprecated.AddRange(bp.VectorActionDescriptions);
}
return brainParametersProto;
}

/// <param name="isTraining">Whether or not the Brain is training.</param>
public static BrainParametersProto ToBrainParametersProto(this ActionSpec actionSpec, string name, bool isTraining)
{
actionSpec.CheckNotHybrid();
if (actionSpec.NumContinuousActions > 0)
var actionSpecProto = new ActionSpecProto
brainParametersProto.VectorActionSize.Add(actionSpec.NumContinuousActions);
brainParametersProto.VectorActionSpaceType = SpaceTypeProto.Continuous;
NumContinuousActions = actionSpec.NumContinuousActions,
NumDiscreteActions = actionSpec.NumDiscreteActions,
};
if (actionSpec.BranchSizes != null)
{
actionSpecProto.DiscreteBranchSizes.AddRange(actionSpec.BranchSizes);
else if (actionSpec.NumDiscreteActions > 0)
brainParametersProto.ActionSpec = actionSpecProto;
var supportHybrid = Academy.Instance.TrainerCapabilities == null || Academy.Instance.TrainerCapabilities.HybridActions;
if (!supportHybrid)
brainParametersProto.VectorActionSize.AddRange(actionSpec.BranchSizes);
brainParametersProto.VectorActionSpaceType = SpaceTypeProto.Discrete;
actionSpec.CheckAllContinuousOrDiscrete();
if (actionSpec.NumContinuousActions > 0)
{
brainParametersProto.VectorActionSizeDeprecated.Add(actionSpec.NumContinuousActions);
brainParametersProto.VectorActionSpaceTypeDeprecated = SpaceTypeProto.Continuous;
}
else if (actionSpec.NumDiscreteActions > 0)
{
brainParametersProto.VectorActionSizeDeprecated.AddRange(actionSpec.BranchSizes);
brainParametersProto.VectorActionSpaceTypeDeprecated = SpaceTypeProto.Discrete;
}
}
// TODO handle ActionDescriptions?

{
var bp = new BrainParameters
{
VectorActionSize = bpp.VectorActionSize.ToArray(),
VectorActionDescriptions = bpp.VectorActionDescriptions.ToArray(),
VectorActionSpaceType = (SpaceType)bpp.VectorActionSpaceType
VectorActionSize = bpp.VectorActionSizeDeprecated.ToArray(),
VectorActionDescriptions = bpp.VectorActionDescriptionsDeprecated.ToArray(),
VectorActionSpaceType = (SpaceType)bpp.VectorActionSpaceTypeDeprecated
};
return bp;
}

}
return dm;
}
#endregion
public static UnityRLInitParameters ToUnityRLInitParameters(this UnityRLInitializationInputProto inputProto)

}
#region AgentAction
public static List<float[]> ToAgentActionList(this UnityRLInputProto.Types.ListAgentActionProto proto)
public static List<ActionBuffers> ToAgentActionList(this UnityRLInputProto.Types.ListAgentActionProto proto)
var agentActions = new List<float[]>(proto.Value.Count);
var agentActions = new List<ActionBuffers>(proto.Value.Count);
agentActions.Add(ap.VectorActions.ToArray());
agentActions.Add(ap.ToActionBuffers());
public static ActionBuffers ToActionBuffers(this AgentActionProto proto)
{
return new ActionBuffers(proto.ContinuousActions.ToArray(), proto.DiscreteActions.ToArray());
}
#endregion
#region Observations

if (!s_HaveWarnedTrainerCapabilitiesMapping)
{
Debug.LogWarning($"The sensor {sensor.GetName()} is using non-trivial mapping and " +
"the attached trainer doesn't support compression mapping. " +
"Switching to uncompressed observations.");
"the attached trainer doesn't support compression mapping. " +
"Switching to uncompressed observations.");
s_HaveWarnedTrainerCapabilitiesMapping = true;
}
compressionType = SensorCompressionType.None;

$"GetCompressedObservation() returned null data for sensor named {sensor.GetName()}. " +
"You must return a byte[]. If you don't want to use compressed observations, " +
"return SensorCompressionType.None from GetCompressionType()."
);
}
observationProto = new ObservationProto
{

observationProto.Shape.AddRange(shape);
return observationProto;
}
#endregion
public static UnityRLCapabilities ToRLCapabilities(this UnityRLCapabilitiesProto proto)

BaseRLCapabilities = proto.BaseRLCapabilities,
ConcatenatedPngObservations = proto.ConcatenatedPngObservations,
CompressedChannelMapping = proto.CompressedChannelMapping,
HybridActions = proto.HybridActions,
};
}

BaseRLCapabilities = rlCaps.BaseRLCapabilities,
ConcatenatedPngObservations = rlCaps.ConcatenatedPngObservations,
CompressedChannelMapping = rlCaps.CompressedChannelMapping,
HybridActions = rlCaps.HybridActions,
};
}

2
com.unity.ml-agents/Runtime/Communicator/ICommunicator.cs


/// <param name="key">A key to identify which behavior actions to get.</param>
/// <param name="agentId">A key to identify which Agent actions to get.</param>
/// <returns></returns>
float[] GetActions(string key, int agentId);
ActionBuffers GetActions(string key, int agentId);
}
}

13
com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs


UnityRLOutputProto m_CurrentUnityRlOutput =
new UnityRLOutputProto();
Dictionary<string, Dictionary<int, float[]>> m_LastActionsReceived =
new Dictionary<string, Dictionary<int, float[]>>();
Dictionary<string, Dictionary<int, ActionBuffers>> m_LastActionsReceived =
new Dictionary<string, Dictionary<int, ActionBuffers>>();
// Brains that we have sent over the communicator with agents.
HashSet<string> m_SentBrainKeys = new HashSet<string>();

{
return false;
}
}
else if (unityVersion.Major != pythonVersion.Major)
{

}
if (!m_LastActionsReceived.ContainsKey(behaviorName))
{
m_LastActionsReceived[behaviorName] = new Dictionary<int, float[]>();
m_LastActionsReceived[behaviorName] = new Dictionary<int, ActionBuffers>();
m_LastActionsReceived[behaviorName][info.episodeId] = null;
m_LastActionsReceived[behaviorName][info.episodeId] = ActionBuffers.Empty;
if (info.done)
{
m_LastActionsReceived[behaviorName].Remove(info.episodeId);

}
}
public float[] GetActions(string behaviorName, int agentId)
public ActionBuffers GetActions(string behaviorName, int agentId)
{
if (m_LastActionsReceived.ContainsKey(behaviorName))
{

}
}
return null;
return ActionBuffers.Empty;
}
/// <summary>

13
com.unity.ml-agents/Runtime/Communicator/UnityRLCapabilities.cs


public bool BaseRLCapabilities;
public bool ConcatenatedPngObservations;
public bool CompressedChannelMapping;
public bool HybridActions;
public UnityRLCapabilities(bool baseRlCapabilities = true, bool concatenatedPngObservations = true, bool compressedChannelMapping = true)
public UnityRLCapabilities(
bool baseRlCapabilities = true,
bool concatenatedPngObservations = true,
bool compressedChannelMapping = true,
bool hybridActions = true)
HybridActions = hybridActions;
}
/// <summary>

return false;
}
Debug.LogWarning("Unity has connected to a Training process that does not support" +
"Base Reinforcement Learning Capabilities. Please make sure you have the" +
" latest training codebase installed for this version of the ML-Agents package.");
"Base Reinforcement Learning Capabilities. Please make sure you have the" +
" latest training codebase installed for this version of the ML-Agents package.");
}
}

82
com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/AgentAction.cs


byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"CjVtbGFnZW50c19lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2FnZW50X2Fj",
"dGlvbi5wcm90bxIUY29tbXVuaWNhdG9yX29iamVjdHMiSwoQQWdlbnRBY3Rp",
"b25Qcm90bxIWCg52ZWN0b3JfYWN0aW9ucxgBIAMoAhINCgV2YWx1ZRgEIAEo",
"AkoECAIQA0oECAMQBEoECAUQBkIlqgIiVW5pdHkuTUxBZ2VudHMuQ29tbXVu",
"aWNhdG9yT2JqZWN0c2IGcHJvdG8z"));
"dGlvbi5wcm90bxIUY29tbXVuaWNhdG9yX29iamVjdHMijAEKEEFnZW50QWN0",
"aW9uUHJvdG8SIQoZdmVjdG9yX2FjdGlvbnNfZGVwcmVjYXRlZBgBIAMoAhIN",
"CgV2YWx1ZRgEIAEoAhIaChJjb250aW51b3VzX2FjdGlvbnMYBiADKAISGAoQ",
"ZGlzY3JldGVfYWN0aW9ucxgHIAMoBUoECAIQA0oECAMQBEoECAUQBkIlqgIi",
"VW5pdHkuTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IGcHJvdG8z"));
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.AgentActionProto), global::Unity.MLAgents.CommunicatorObjects.AgentActionProto.Parser, new[]{ "VectorActions", "Value" }, null, null, null)
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.AgentActionProto), global::Unity.MLAgents.CommunicatorObjects.AgentActionProto.Parser, new[]{ "VectorActionsDeprecated", "Value", "ContinuousActions", "DiscreteActions" }, null, null, null)
}));
}
#endregion

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public AgentActionProto(AgentActionProto other) : this() {
vectorActions_ = other.vectorActions_.Clone();
vectorActionsDeprecated_ = other.vectorActionsDeprecated_.Clone();
continuousActions_ = other.continuousActions_.Clone();
discreteActions_ = other.discreteActions_.Clone();
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

}
/// <summary>Field number for the "vector_actions" field.</summary>
public const int VectorActionsFieldNumber = 1;
private static readonly pb::FieldCodec<float> _repeated_vectorActions_codec
/// <summary>Field number for the "vector_actions_deprecated" field.</summary>
public const int VectorActionsDeprecatedFieldNumber = 1;
private static readonly pb::FieldCodec<float> _repeated_vectorActionsDeprecated_codec
private readonly pbc::RepeatedField<float> vectorActions_ = new pbc::RepeatedField<float>();
private readonly pbc::RepeatedField<float> vectorActionsDeprecated_ = new pbc::RepeatedField<float>();
/// <summary>
/// mark as deprecated in communicator v1.3.0
/// </summary>
public pbc::RepeatedField<float> VectorActions {
get { return vectorActions_; }
public pbc::RepeatedField<float> VectorActionsDeprecated {
get { return vectorActionsDeprecated_; }
}
/// <summary>Field number for the "value" field.</summary>

}
}
/// <summary>Field number for the "continuous_actions" field.</summary>
public const int ContinuousActionsFieldNumber = 6;
private static readonly pb::FieldCodec<float> _repeated_continuousActions_codec
= pb::FieldCodec.ForFloat(50);
private readonly pbc::RepeatedField<float> continuousActions_ = new pbc::RepeatedField<float>();
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public pbc::RepeatedField<float> ContinuousActions {
get { return continuousActions_; }
}
/// <summary>Field number for the "discrete_actions" field.</summary>
public const int DiscreteActionsFieldNumber = 7;
private static readonly pb::FieldCodec<int> _repeated_discreteActions_codec
= pb::FieldCodec.ForInt32(58);
private readonly pbc::RepeatedField<int> discreteActions_ = new pbc::RepeatedField<int>();
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public pbc::RepeatedField<int> DiscreteActions {
get { return discreteActions_; }
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as AgentActionProto);

if (ReferenceEquals(other, this)) {
return true;
}
if(!vectorActions_.Equals(other.vectorActions_)) return false;
if(!vectorActionsDeprecated_.Equals(other.vectorActionsDeprecated_)) return false;
if(!continuousActions_.Equals(other.continuousActions_)) return false;
if(!discreteActions_.Equals(other.discreteActions_)) return false;
return Equals(_unknownFields, other._unknownFields);
}

hash ^= vectorActions_.GetHashCode();
hash ^= vectorActionsDeprecated_.GetHashCode();
hash ^= continuousActions_.GetHashCode();
hash ^= discreteActions_.GetHashCode();
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public void WriteTo(pb::CodedOutputStream output) {
vectorActions_.WriteTo(output, _repeated_vectorActions_codec);
vectorActionsDeprecated_.WriteTo(output, _repeated_vectorActionsDeprecated_codec);
continuousActions_.WriteTo(output, _repeated_continuousActions_codec);
discreteActions_.WriteTo(output, _repeated_discreteActions_codec);
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}

public int CalculateSize() {
int size = 0;
size += vectorActions_.CalculateSize(_repeated_vectorActions_codec);
size += vectorActionsDeprecated_.CalculateSize(_repeated_vectorActionsDeprecated_codec);
size += continuousActions_.CalculateSize(_repeated_continuousActions_codec);
size += discreteActions_.CalculateSize(_repeated_discreteActions_codec);
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();
}

if (other == null) {
return;
}
vectorActions_.Add(other.vectorActions_);
vectorActionsDeprecated_.Add(other.vectorActionsDeprecated_);
continuousActions_.Add(other.continuousActions_);
discreteActions_.Add(other.discreteActions_);
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

break;
case 10:
case 13: {
vectorActions_.AddEntriesFrom(input, _repeated_vectorActions_codec);
vectorActionsDeprecated_.AddEntriesFrom(input, _repeated_vectorActionsDeprecated_codec);
break;
}
case 50:
case 53: {
continuousActions_.AddEntriesFrom(input, _repeated_continuousActions_codec);
break;
}
case 58:
case 56: {
discreteActions_.AddEntriesFrom(input, _repeated_discreteActions_codec);
break;
}
}

348
com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/BrainParameters.cs


"CjltbGFnZW50c19lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2JyYWluX3Bh",
"cmFtZXRlcnMucHJvdG8SFGNvbW11bmljYXRvcl9vYmplY3RzGjNtbGFnZW50",
"c19lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL3NwYWNlX3R5cGUucHJvdG8i",
"2QEKFEJyYWluUGFyYW1ldGVyc1Byb3RvEhoKEnZlY3Rvcl9hY3Rpb25fc2l6",
"ZRgDIAMoBRIiChp2ZWN0b3JfYWN0aW9uX2Rlc2NyaXB0aW9ucxgFIAMoCRJG",
"Chh2ZWN0b3JfYWN0aW9uX3NwYWNlX3R5cGUYBiABKA4yJC5jb21tdW5pY2F0",
"b3Jfb2JqZWN0cy5TcGFjZVR5cGVQcm90bxISCgpicmFpbl9uYW1lGAcgASgJ",
"EhMKC2lzX3RyYWluaW5nGAggASgISgQIARACSgQIAhADSgQIBBAFQiWqAiJV",
"bml0eS5NTEFnZW50cy5Db21tdW5pY2F0b3JPYmplY3RzYgZwcm90bzM="));
"iwEKD0FjdGlvblNwZWNQcm90bxIeChZudW1fY29udGludW91c19hY3Rpb25z",
"GAEgASgFEhwKFG51bV9kaXNjcmV0ZV9hY3Rpb25zGAIgASgFEh0KFWRpc2Ny",
"ZXRlX2JyYW5jaF9zaXplcxgDIAMoBRIbChNhY3Rpb25fZGVzY3JpcHRpb25z",
"GAQgAygJIrYCChRCcmFpblBhcmFtZXRlcnNQcm90bxIlCh12ZWN0b3JfYWN0",
"aW9uX3NpemVfZGVwcmVjYXRlZBgDIAMoBRItCiV2ZWN0b3JfYWN0aW9uX2Rl",
"c2NyaXB0aW9uc19kZXByZWNhdGVkGAUgAygJElEKI3ZlY3Rvcl9hY3Rpb25f",
"c3BhY2VfdHlwZV9kZXByZWNhdGVkGAYgASgOMiQuY29tbXVuaWNhdG9yX29i",
"amVjdHMuU3BhY2VUeXBlUHJvdG8SEgoKYnJhaW5fbmFtZRgHIAEoCRITCgtp",
"c190cmFpbmluZxgIIAEoCBI6CgthY3Rpb25fc3BlYxgJIAEoCzIlLmNvbW11",
"bmljYXRvcl9vYmplY3RzLkFjdGlvblNwZWNQcm90b0oECAEQAkoECAIQA0oE",
"CAQQBUIlqgIiVW5pdHkuTUxBZ2VudHMuQ29tbXVuaWNhdG9yT2JqZWN0c2IG",
"cHJvdG8z"));
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.BrainParametersProto), global::Unity.MLAgents.CommunicatorObjects.BrainParametersProto.Parser, new[]{ "VectorActionSize", "VectorActionDescriptions", "VectorActionSpaceType", "BrainName", "IsTraining" }, null, null, null)
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.ActionSpecProto), global::Unity.MLAgents.CommunicatorObjects.ActionSpecProto.Parser, new[]{ "NumContinuousActions", "NumDiscreteActions", "DiscreteBranchSizes", "ActionDescriptions" }, null, null, null),
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.BrainParametersProto), global::Unity.MLAgents.CommunicatorObjects.BrainParametersProto.Parser, new[]{ "VectorActionSizeDeprecated", "VectorActionDescriptionsDeprecated", "VectorActionSpaceTypeDeprecated", "BrainName", "IsTraining", "ActionSpec" }, null, null, null)
}));
}
#endregion

internal sealed partial class ActionSpecProto : pb::IMessage<ActionSpecProto> {
private static readonly pb::MessageParser<ActionSpecProto> _parser = new pb::MessageParser<ActionSpecProto>(() => new ActionSpecProto());
private pb::UnknownFieldSet _unknownFields;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public static pb::MessageParser<ActionSpecProto> Parser { get { return _parser; } }
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public static pbr::MessageDescriptor Descriptor {
get { return global::Unity.MLAgents.CommunicatorObjects.BrainParametersReflection.Descriptor.MessageTypes[0]; }
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
pbr::MessageDescriptor pb::IMessage.Descriptor {
get { return Descriptor; }
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public ActionSpecProto() {
OnConstruction();
}
partial void OnConstruction();
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public ActionSpecProto(ActionSpecProto other) : this() {
numContinuousActions_ = other.numContinuousActions_;
numDiscreteActions_ = other.numDiscreteActions_;
discreteBranchSizes_ = other.discreteBranchSizes_.Clone();
actionDescriptions_ = other.actionDescriptions_.Clone();
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public ActionSpecProto Clone() {
return new ActionSpecProto(this);
}
/// <summary>Field number for the "num_continuous_actions" field.</summary>
public const int NumContinuousActionsFieldNumber = 1;
private int numContinuousActions_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public int NumContinuousActions {
get { return numContinuousActions_; }
set {
numContinuousActions_ = value;
}
}
/// <summary>Field number for the "num_discrete_actions" field.</summary>
public const int NumDiscreteActionsFieldNumber = 2;
private int numDiscreteActions_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public int NumDiscreteActions {
get { return numDiscreteActions_; }
set {
numDiscreteActions_ = value;
}
}
/// <summary>Field number for the "discrete_branch_sizes" field.</summary>
public const int DiscreteBranchSizesFieldNumber = 3;
private static readonly pb::FieldCodec<int> _repeated_discreteBranchSizes_codec
= pb::FieldCodec.ForInt32(26);
private readonly pbc::RepeatedField<int> discreteBranchSizes_ = new pbc::RepeatedField<int>();
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public pbc::RepeatedField<int> DiscreteBranchSizes {
get { return discreteBranchSizes_; }
}
/// <summary>Field number for the "action_descriptions" field.</summary>
public const int ActionDescriptionsFieldNumber = 4;
private static readonly pb::FieldCodec<string> _repeated_actionDescriptions_codec
= pb::FieldCodec.ForString(34);
private readonly pbc::RepeatedField<string> actionDescriptions_ = new pbc::RepeatedField<string>();
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public pbc::RepeatedField<string> ActionDescriptions {
get { return actionDescriptions_; }
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as ActionSpecProto);
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public bool Equals(ActionSpecProto other) {
if (ReferenceEquals(other, null)) {
return false;
}
if (ReferenceEquals(other, this)) {
return true;
}
if (NumContinuousActions != other.NumContinuousActions) return false;
if (NumDiscreteActions != other.NumDiscreteActions) return false;
if(!discreteBranchSizes_.Equals(other.discreteBranchSizes_)) return false;
if(!actionDescriptions_.Equals(other.actionDescriptions_)) return false;
return Equals(_unknownFields, other._unknownFields);
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override int GetHashCode() {
int hash = 1;
if (NumContinuousActions != 0) hash ^= NumContinuousActions.GetHashCode();
if (NumDiscreteActions != 0) hash ^= NumDiscreteActions.GetHashCode();
hash ^= discreteBranchSizes_.GetHashCode();
hash ^= actionDescriptions_.GetHashCode();
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}
return hash;
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override string ToString() {
return pb::JsonFormatter.ToDiagnosticString(this);
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public void WriteTo(pb::CodedOutputStream output) {
if (NumContinuousActions != 0) {
output.WriteRawTag(8);
output.WriteInt32(NumContinuousActions);
}
if (NumDiscreteActions != 0) {
output.WriteRawTag(16);
output.WriteInt32(NumDiscreteActions);
}
discreteBranchSizes_.WriteTo(output, _repeated_discreteBranchSizes_codec);
actionDescriptions_.WriteTo(output, _repeated_actionDescriptions_codec);
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public int CalculateSize() {
int size = 0;
if (NumContinuousActions != 0) {
size += 1 + pb::CodedOutputStream.ComputeInt32Size(NumContinuousActions);
}
if (NumDiscreteActions != 0) {
size += 1 + pb::CodedOutputStream.ComputeInt32Size(NumDiscreteActions);
}
size += discreteBranchSizes_.CalculateSize(_repeated_discreteBranchSizes_codec);
size += actionDescriptions_.CalculateSize(_repeated_actionDescriptions_codec);
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();
}
return size;
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public void MergeFrom(ActionSpecProto other) {
if (other == null) {
return;
}
if (other.NumContinuousActions != 0) {
NumContinuousActions = other.NumContinuousActions;
}
if (other.NumDiscreteActions != 0) {
NumDiscreteActions = other.NumDiscreteActions;
}
discreteBranchSizes_.Add(other.discreteBranchSizes_);
actionDescriptions_.Add(other.actionDescriptions_);
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public void MergeFrom(pb::CodedInputStream input) {
uint tag;
while ((tag = input.ReadTag()) != 0) {
switch(tag) {
default:
_unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input);
break;
case 8: {
NumContinuousActions = input.ReadInt32();
break;
}
case 16: {
NumDiscreteActions = input.ReadInt32();
break;
}
case 26:
case 24: {
discreteBranchSizes_.AddEntriesFrom(input, _repeated_discreteBranchSizes_codec);
break;
}
case 34: {
actionDescriptions_.AddEntriesFrom(input, _repeated_actionDescriptions_codec);
break;
}
}
}
}
}
internal sealed partial class BrainParametersProto : pb::IMessage<BrainParametersProto> {
private static readonly pb::MessageParser<BrainParametersProto> _parser = new pb::MessageParser<BrainParametersProto>(() => new BrainParametersProto());
private pb::UnknownFieldSet _unknownFields;

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public static pbr::MessageDescriptor Descriptor {
get { return global::Unity.MLAgents.CommunicatorObjects.BrainParametersReflection.Descriptor.MessageTypes[0]; }
get { return global::Unity.MLAgents.CommunicatorObjects.BrainParametersReflection.Descriptor.MessageTypes[1]; }
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public BrainParametersProto(BrainParametersProto other) : this() {
vectorActionSize_ = other.vectorActionSize_.Clone();
vectorActionDescriptions_ = other.vectorActionDescriptions_.Clone();
vectorActionSpaceType_ = other.vectorActionSpaceType_;
vectorActionSizeDeprecated_ = other.vectorActionSizeDeprecated_.Clone();
vectorActionDescriptionsDeprecated_ = other.vectorActionDescriptionsDeprecated_.Clone();
vectorActionSpaceTypeDeprecated_ = other.vectorActionSpaceTypeDeprecated_;
ActionSpec = other.actionSpec_ != null ? other.ActionSpec.Clone() : null;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

}
/// <summary>Field number for the "vector_action_size" field.</summary>
public const int VectorActionSizeFieldNumber = 3;
private static readonly pb::FieldCodec<int> _repeated_vectorActionSize_codec
/// <summary>Field number for the "vector_action_size_deprecated" field.</summary>
public const int VectorActionSizeDeprecatedFieldNumber = 3;
private static readonly pb::FieldCodec<int> _repeated_vectorActionSizeDeprecated_codec
private readonly pbc::RepeatedField<int> vectorActionSize_ = new pbc::RepeatedField<int>();
private readonly pbc::RepeatedField<int> vectorActionSizeDeprecated_ = new pbc::RepeatedField<int>();
/// <summary>
/// mark as deprecated in communicator v1.3.0
/// </summary>
public pbc::RepeatedField<int> VectorActionSize {
get { return vectorActionSize_; }
public pbc::RepeatedField<int> VectorActionSizeDeprecated {
get { return vectorActionSizeDeprecated_; }
/// <summary>Field number for the "vector_action_descriptions" field.</summary>
public const int VectorActionDescriptionsFieldNumber = 5;
private static readonly pb::FieldCodec<string> _repeated_vectorActionDescriptions_codec
/// <summary>Field number for the "vector_action_descriptions_deprecated" field.</summary>
public const int VectorActionDescriptionsDeprecatedFieldNumber = 5;
private static readonly pb::FieldCodec<string> _repeated_vectorActionDescriptionsDeprecated_codec
private readonly pbc::RepeatedField<string> vectorActionDescriptions_ = new pbc::RepeatedField<string>();
private readonly pbc::RepeatedField<string> vectorActionDescriptionsDeprecated_ = new pbc::RepeatedField<string>();
/// <summary>
/// mark as deprecated in communicator v1.3.0
/// </summary>
public pbc::RepeatedField<string> VectorActionDescriptions {
get { return vectorActionDescriptions_; }
public pbc::RepeatedField<string> VectorActionDescriptionsDeprecated {
get { return vectorActionDescriptionsDeprecated_; }
/// <summary>Field number for the "vector_action_space_type" field.</summary>
public const int VectorActionSpaceTypeFieldNumber = 6;
private global::Unity.MLAgents.CommunicatorObjects.SpaceTypeProto vectorActionSpaceType_ = 0;
/// <summary>Field number for the "vector_action_space_type_deprecated" field.</summary>
public const int VectorActionSpaceTypeDeprecatedFieldNumber = 6;
private global::Unity.MLAgents.CommunicatorObjects.SpaceTypeProto vectorActionSpaceTypeDeprecated_ = 0;
/// <summary>
/// mark as deprecated in communicator v1.3.0
/// </summary>
public global::Unity.MLAgents.CommunicatorObjects.SpaceTypeProto VectorActionSpaceType {
get { return vectorActionSpaceType_; }
public global::Unity.MLAgents.CommunicatorObjects.SpaceTypeProto VectorActionSpaceTypeDeprecated {
get { return vectorActionSpaceTypeDeprecated_; }
vectorActionSpaceType_ = value;
vectorActionSpaceTypeDeprecated_ = value;
}
}

}
}
/// <summary>Field number for the "action_spec" field.</summary>
public const int ActionSpecFieldNumber = 9;
private global::Unity.MLAgents.CommunicatorObjects.ActionSpecProto actionSpec_;
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public global::Unity.MLAgents.CommunicatorObjects.ActionSpecProto ActionSpec {
get { return actionSpec_; }
set {
actionSpec_ = value;
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as BrainParametersProto);

if (ReferenceEquals(other, this)) {
return true;
}
if(!vectorActionSize_.Equals(other.vectorActionSize_)) return false;
if(!vectorActionDescriptions_.Equals(other.vectorActionDescriptions_)) return false;
if (VectorActionSpaceType != other.VectorActionSpaceType) return false;
if(!vectorActionSizeDeprecated_.Equals(other.vectorActionSizeDeprecated_)) return false;
if(!vectorActionDescriptionsDeprecated_.Equals(other.vectorActionDescriptionsDeprecated_)) return false;
if (VectorActionSpaceTypeDeprecated != other.VectorActionSpaceTypeDeprecated) return false;
if (!object.Equals(ActionSpec, other.ActionSpec)) return false;
return Equals(_unknownFields, other._unknownFields);
}

hash ^= vectorActionSize_.GetHashCode();
hash ^= vectorActionDescriptions_.GetHashCode();
if (VectorActionSpaceType != 0) hash ^= VectorActionSpaceType.GetHashCode();
hash ^= vectorActionSizeDeprecated_.GetHashCode();
hash ^= vectorActionDescriptionsDeprecated_.GetHashCode();
if (VectorActionSpaceTypeDeprecated != 0) hash ^= VectorActionSpaceTypeDeprecated.GetHashCode();
if (actionSpec_ != null) hash ^= ActionSpec.GetHashCode();
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public void WriteTo(pb::CodedOutputStream output) {
vectorActionSize_.WriteTo(output, _repeated_vectorActionSize_codec);
vectorActionDescriptions_.WriteTo(output, _repeated_vectorActionDescriptions_codec);
if (VectorActionSpaceType != 0) {
vectorActionSizeDeprecated_.WriteTo(output, _repeated_vectorActionSizeDeprecated_codec);
vectorActionDescriptionsDeprecated_.WriteTo(output, _repeated_vectorActionDescriptionsDeprecated_codec);
if (VectorActionSpaceTypeDeprecated != 0) {
output.WriteEnum((int) VectorActionSpaceType);
output.WriteEnum((int) VectorActionSpaceTypeDeprecated);
}
if (BrainName.Length != 0) {
output.WriteRawTag(58);

output.WriteRawTag(64);
output.WriteBool(IsTraining);
}
if (actionSpec_ != null) {
output.WriteRawTag(74);
output.WriteMessage(ActionSpec);
}
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}

public int CalculateSize() {
int size = 0;
size += vectorActionSize_.CalculateSize(_repeated_vectorActionSize_codec);
size += vectorActionDescriptions_.CalculateSize(_repeated_vectorActionDescriptions_codec);
if (VectorActionSpaceType != 0) {
size += 1 + pb::CodedOutputStream.ComputeEnumSize((int) VectorActionSpaceType);
size += vectorActionSizeDeprecated_.CalculateSize(_repeated_vectorActionSizeDeprecated_codec);
size += vectorActionDescriptionsDeprecated_.CalculateSize(_repeated_vectorActionDescriptionsDeprecated_codec);
if (VectorActionSpaceTypeDeprecated != 0) {
size += 1 + pb::CodedOutputStream.ComputeEnumSize((int) VectorActionSpaceTypeDeprecated);
}
if (BrainName.Length != 0) {
size += 1 + pb::CodedOutputStream.ComputeStringSize(BrainName);

}
if (actionSpec_ != null) {
size += 1 + pb::CodedOutputStream.ComputeMessageSize(ActionSpec);
}
if (_unknownFields != null) {
size += _unknownFields.CalculateSize();

if (other == null) {
return;
}
vectorActionSize_.Add(other.vectorActionSize_);
vectorActionDescriptions_.Add(other.vectorActionDescriptions_);
if (other.VectorActionSpaceType != 0) {
VectorActionSpaceType = other.VectorActionSpaceType;
vectorActionSizeDeprecated_.Add(other.vectorActionSizeDeprecated_);
vectorActionDescriptionsDeprecated_.Add(other.vectorActionDescriptionsDeprecated_);
if (other.VectorActionSpaceTypeDeprecated != 0) {
VectorActionSpaceTypeDeprecated = other.VectorActionSpaceTypeDeprecated;
}
if (other.BrainName.Length != 0) {
BrainName = other.BrainName;

}
if (other.actionSpec_ != null) {
if (actionSpec_ == null) {
actionSpec_ = new global::Unity.MLAgents.CommunicatorObjects.ActionSpecProto();
}
ActionSpec.MergeFrom(other.ActionSpec);
}
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

break;
case 26:
case 24: {
vectorActionSize_.AddEntriesFrom(input, _repeated_vectorActionSize_codec);
vectorActionSizeDeprecated_.AddEntriesFrom(input, _repeated_vectorActionSizeDeprecated_codec);
vectorActionDescriptions_.AddEntriesFrom(input, _repeated_vectorActionDescriptions_codec);
vectorActionDescriptionsDeprecated_.AddEntriesFrom(input, _repeated_vectorActionDescriptionsDeprecated_codec);
vectorActionSpaceType_ = (global::Unity.MLAgents.CommunicatorObjects.SpaceTypeProto) input.ReadEnum();
vectorActionSpaceTypeDeprecated_ = (global::Unity.MLAgents.CommunicatorObjects.SpaceTypeProto) input.ReadEnum();
break;
}
case 58: {

case 64: {
IsTraining = input.ReadBool();
break;
}
case 74: {
if (actionSpec_ == null) {
actionSpec_ = new global::Unity.MLAgents.CommunicatorObjects.ActionSpecProto();
}
input.ReadMessage(actionSpec_);
break;
}
}

44
com.unity.ml-agents/Runtime/Grpc/CommunicatorObjects/Capabilities.cs


byte[] descriptorData = global::System.Convert.FromBase64String(
string.Concat(
"CjVtbGFnZW50c19lbnZzL2NvbW11bmljYXRvcl9vYmplY3RzL2NhcGFiaWxp",
"dGllcy5wcm90bxIUY29tbXVuaWNhdG9yX29iamVjdHMifQoYVW5pdHlSTENh",
"cGFiaWxpdGllc1Byb3RvEhoKEmJhc2VSTENhcGFiaWxpdGllcxgBIAEoCBIj",
"Chtjb25jYXRlbmF0ZWRQbmdPYnNlcnZhdGlvbnMYAiABKAgSIAoYY29tcHJl",
"c3NlZENoYW5uZWxNYXBwaW5nGAMgASgIQiWqAiJVbml0eS5NTEFnZW50cy5D",
"b21tdW5pY2F0b3JPYmplY3RzYgZwcm90bzM="));
"dGllcy5wcm90bxIUY29tbXVuaWNhdG9yX29iamVjdHMilAEKGFVuaXR5UkxD",
"YXBhYmlsaXRpZXNQcm90bxIaChJiYXNlUkxDYXBhYmlsaXRpZXMYASABKAgS",
"IwobY29uY2F0ZW5hdGVkUG5nT2JzZXJ2YXRpb25zGAIgASgIEiAKGGNvbXBy",
"ZXNzZWRDaGFubmVsTWFwcGluZxgDIAEoCBIVCg1oeWJyaWRBY3Rpb25zGAQg",
"ASgIQiWqAiJVbml0eS5NTEFnZW50cy5Db21tdW5pY2F0b3JPYmplY3RzYgZw",
"cm90bzM="));
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.UnityRLCapabilitiesProto), global::Unity.MLAgents.CommunicatorObjects.UnityRLCapabilitiesProto.Parser, new[]{ "BaseRLCapabilities", "ConcatenatedPngObservations", "CompressedChannelMapping" }, null, null, null)
new pbr::GeneratedClrTypeInfo(typeof(global::Unity.MLAgents.CommunicatorObjects.UnityRLCapabilitiesProto), global::Unity.MLAgents.CommunicatorObjects.UnityRLCapabilitiesProto.Parser, new[]{ "BaseRLCapabilities", "ConcatenatedPngObservations", "CompressedChannelMapping", "HybridActions" }, null, null, null)
}));
}
#endregion

baseRLCapabilities_ = other.baseRLCapabilities_;
concatenatedPngObservations_ = other.concatenatedPngObservations_;
compressedChannelMapping_ = other.compressedChannelMapping_;
hybridActions_ = other.hybridActions_;
_unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields);
}

}
}
/// <summary>Field number for the "hybridActions" field.</summary>
public const int HybridActionsFieldNumber = 4;
private bool hybridActions_;
/// <summary>
/// support for hybrid action spaces (discrete + continuous)
/// </summary>
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public bool HybridActions {
get { return hybridActions_; }
set {
hybridActions_ = value;
}
}
[global::System.Diagnostics.DebuggerNonUserCodeAttribute]
public override bool Equals(object other) {
return Equals(other as UnityRLCapabilitiesProto);

if (BaseRLCapabilities != other.BaseRLCapabilities) return false;
if (ConcatenatedPngObservations != other.ConcatenatedPngObservations) return false;
if (CompressedChannelMapping != other.CompressedChannelMapping) return false;
if (HybridActions != other.HybridActions) return false;
return Equals(_unknownFields, other._unknownFields);
}

if (BaseRLCapabilities != false) hash ^= BaseRLCapabilities.GetHashCode();
if (ConcatenatedPngObservations != false) hash ^= ConcatenatedPngObservations.GetHashCode();
if (CompressedChannelMapping != false) hash ^= CompressedChannelMapping.GetHashCode();
if (HybridActions != false) hash ^= HybridActions.GetHashCode();
if (_unknownFields != null) {
hash ^= _unknownFields.GetHashCode();
}

output.WriteRawTag(24);
output.WriteBool(CompressedChannelMapping);
}
if (HybridActions != false) {
output.WriteRawTag(32);
output.WriteBool(HybridActions);
}
if (_unknownFields != null) {
_unknownFields.WriteTo(output);
}

size += 1 + 1;
}
if (CompressedChannelMapping != false) {
size += 1 + 1;
}
if (HybridActions != false) {
size += 1 + 1;
}
if (_unknownFields != null) {

if (other.CompressedChannelMapping != false) {
CompressedChannelMapping = other.CompressedChannelMapping;
}
if (other.HybridActions != false) {
HybridActions = other.HybridActions;
}
_unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields);
}

}
case 24: {
CompressedChannelMapping = input.ReadBool();
break;
}
case 32: {
HybridActions = input.ReadBool();
break;
}
}

44
com.unity.ml-agents/Runtime/Inference/ApplierImpl.cs


using System.Collections.Generic;
using System.Linq;
using Unity.MLAgents.Inference.Utils;
using Unity.MLAgents.Actuators;
using Unity.Barracuda;
using UnityEngine;

/// </summary>
internal class ContinuousActionOutputApplier : TensorApplier.IApplier
{
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, float[]> lastActions)
readonly ActionSpec m_ActionSpec;
public ContinuousActionOutputApplier(ActionSpec actionSpec)
{
m_ActionSpec = actionSpec;
}
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, ActionBuffers> lastActions)
{
var actionSize = tensorProxy.shape[tensorProxy.shape.Length - 1];
var agentIndex = 0;

{
var actionValue = lastActions[agentId];
if (actionValue == null)
var actionBuffer = lastActions[agentId];
if (actionBuffer.IsEmpty())
actionValue = new float[actionSize];
lastActions[agentId] = actionValue;
actionBuffer = new ActionBuffers(m_ActionSpec);
lastActions[agentId] = actionBuffer;
var continuousBuffer = actionBuffer.ContinuousActions;
actionValue[j] = tensorProxy.data[agentIndex, j];
continuousBuffer[j] = tensorProxy.data[agentIndex, j];
}
}
agentIndex++;

readonly int[] m_ActionSize;
readonly Multinomial m_Multinomial;
readonly ITensorAllocator m_Allocator;
readonly ActionSpec m_ActionSpec;
public DiscreteActionOutputApplier(int[] actionSize, int seed, ITensorAllocator allocator)
public DiscreteActionOutputApplier(ActionSpec actionSpec, int seed, ITensorAllocator allocator)
m_ActionSize = actionSize;
m_ActionSize = actionSpec.BranchSizes;
m_ActionSpec = actionSpec;
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, float[]> lastActions)
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, ActionBuffers> lastActions)
{
//var tensorDataProbabilities = tensorProxy.Data as float[,];
var idActionPairList = actionIds as List<int> ?? actionIds.ToList();

{
if (lastActions.ContainsKey(agentId))
{
var actionVal = lastActions[agentId];
if (actionVal == null)
var actionBuffer = lastActions[agentId];
if (actionBuffer.IsEmpty())
actionVal = new float[m_ActionSize.Length];
lastActions[agentId] = actionVal;
actionBuffer = new ActionBuffers(m_ActionSpec);
lastActions[agentId] = actionBuffer;
var discreteBuffer = actionBuffer.DiscreteActions;
actionVal[j] = actionValues[agentIndex, j];
discreteBuffer[j] = (int)actionValues[agentIndex, j];
}
}
agentIndex++;

m_Memories = memories;
}
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, float[]> lastActions)
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, ActionBuffers> lastActions)
{
var agentIndex = 0;
var memorySize = (int)tensorProxy.shape[tensorProxy.shape.Length - 1];

m_Memories = memories;
}
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, float[]> lastActions)
public void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, ActionBuffers> lastActions)
{
var agentIndex = 0;
var memorySize = (int)tensorProxy.shape[tensorProxy.shape.Length - 1];
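As a rough illustration (sketch only, not the internal implementation), the change above means each applier now writes model output into the matching segment of the agent's ActionBuffers instead of a raw float[]; the sketch below shows the equivalent mapping for a single flat output array, mirroring ActionBuffers.FromActionSpec shown earlier, with the helper name chosen for this example:
using Unity.MLAgents.Actuators;

public static class ApplierSketch
{
    public static ActionBuffers SplitModelOutput(ActionSpec spec, float[] modelOutput)
    {
        var buffers = new ActionBuffers(spec);
        var continuous = buffers.ContinuousActions;
        for (var i = 0; i < spec.NumContinuousActions; i++)
        {
            continuous[i] = modelOutput[i];
        }
        var discrete = buffers.DiscreteActions;
        for (var i = 0; i < spec.NumDiscreteActions; i++)
        {
            // Discrete branch choices arrive as floats and are truncated to int.
            discrete[i] = (int)modelOutput[spec.NumContinuousActions + i];
        }
        return buffers;
    }
}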

237
com.unity.ml-agents/Runtime/Inference/BarracudaModelParamLoader.cs


/// </summary>
internal class BarracudaModelParamLoader
{
enum ModelActionType
{
Unknown,
Discrete,
Continuous
}
/// Generates the Tensor inputs that are expected to be present in the Model.
/// </summary>
/// <param name="model">
/// The Barracuda engine model for loading static parameters.
/// </param>
/// <returns>TensorProxy IEnumerable with the expected Tensor inputs.</returns>
public static IReadOnlyList<TensorProxy> GetInputTensors(Model model)
{
var tensors = new List<TensorProxy>();
if (model == null)
return tensors;
foreach (var input in model.inputs)
{
tensors.Add(new TensorProxy
{
name = input.name,
valueType = TensorProxy.TensorType.FloatingPoint,
data = null,
shape = input.shape.Select(i => (long)i).ToArray()
});
}
foreach (var mem in model.memories)
{
tensors.Add(new TensorProxy
{
name = mem.input,
valueType = TensorProxy.TensorType.FloatingPoint,
data = null,
shape = TensorUtils.TensorShapeFromBarracuda(mem.shape)
});
}
tensors.Sort((el1, el2) => el1.name.CompareTo(el2.name));
return tensors;
}
public static int GetNumVisualInputs(Model model)
{
var count = 0;
if (model == null)
return count;
foreach (var input in model.inputs)
{
if (input.shape.Length == 4)
{
if (input.name.StartsWith(TensorNames.VisualObservationPlaceholderPrefix))
{
count++;
}
}
}
return count;
}
/// <summary>
/// Generates the Tensor outputs that are expected to be present in the Model.
/// </summary>
/// <param name="model">
/// The Barracuda engine model for loading static parameters
/// </param>
/// <returns>Array of the output tensor names expected to be present in the model.</returns>
public static string[] GetOutputNames(Model model)
{
var names = new List<string>();
if (model == null)
{
return names.ToArray();
}
names.Add(TensorNames.ActionOutput);
var memory = (int)model.GetTensorByName(TensorNames.MemorySize)[0];
if (memory > 0)
{
foreach (var mem in model.memories)
{
names.Add(mem.output);
}
}
names.Sort();
return names.ToArray();
}
/// <summary>
/// Factory for the ModelParamLoader : Creates a ModelParamLoader and runs the checks
/// on it.
/// </summary>

return failedModelChecks;
}
foreach (var constantName in TensorNames.RequiredConstants)
var hasExpectedTensors = model.CheckExpectedTensors(failedModelChecks);
if (!hasExpectedTensors)
var tensor = model.GetTensorByName(constantName);
if (tensor == null)
{
failedModelChecks.Add($"Required constant \"{constantName}\" was not found in the model file.");
return failedModelChecks;
}
return failedModelChecks;
var memorySize = (int)model.GetTensorByName(TensorNames.MemorySize)[0];
var isContinuousInt = (int)model.GetTensorByName(TensorNames.IsContinuousControl)[0];
var isContinuous = GetActionType(isContinuousInt);
var actionSize = (int)model.GetTensorByName(TensorNames.ActionOutputShape)[0];
if (modelApiVersion == -1)
{
failedModelChecks.Add(

return failedModelChecks;
}
var modelDiscreteActionSize = isContinuous == ModelActionType.Discrete ? actionSize : 0;
var modelContinuousActionSize = isContinuous == ModelActionType.Continuous ? actionSize : 0;
var memorySize = (int)model.GetTensorByName(TensorNames.MemorySize)[0];
if (memorySize == -1)
{
failedModelChecks.Add($"Missing node in the model provided : {TensorNames.MemorySize}");
return failedModelChecks;
}
CheckIntScalarPresenceHelper(new Dictionary<string, int>()
{
{TensorNames.MemorySize, memorySize},
{TensorNames.IsContinuousControl, isContinuousInt},
{TensorNames.ActionOutputShape, actionSize}
})
CheckInputTensorPresence(model, brainParameters, memorySize, sensorComponents)
CheckInputTensorPresence(model, brainParameters, memorySize, isContinuous, sensorComponents)
CheckOutputTensorPresence(model, memorySize)
failedModelChecks.AddRange(
CheckOutputTensorPresence(model, memorySize))
;
CheckOutputTensorShape(model, brainParameters, actuatorComponents, isContinuous, modelContinuousActionSize, modelDiscreteActionSize)
CheckOutputTensorShape(model, brainParameters, actuatorComponents)
/// Converts the integer value in the model corresponding to the type of control to a
/// ModelActionType.
/// </summary>
/// <param name="isContinuousInt">
/// The integer value in the model indicating the type of control
/// </param>
/// <returns>The equivalent ModelActionType</returns>
static ModelActionType GetActionType(int isContinuousInt)
{
ModelActionType isContinuous;
switch (isContinuousInt)
{
case 0:
isContinuous = ModelActionType.Discrete;
break;
case 1:
isContinuous = ModelActionType.Continuous;
break;
default:
isContinuous = ModelActionType.Unknown;
break;
}
return isContinuous;
}
/// <summary>
/// Given a Dictionary of node names to int values, creates checks that fail if any of
/// the values is the invalid value of -1.
/// </summary>
/// <param name="requiredScalarFields"> Mapping from node names to int values</param>
/// <returns>The list of error messages for the checks that failed</returns>
static IEnumerable<string> CheckIntScalarPresenceHelper(
Dictionary<string, int> requiredScalarFields)
{
var failedModelChecks = new List<string>();
foreach (var field in requiredScalarFields)
{
if (field.Value == -1)
{
failedModelChecks.Add($"Missing node in the model provided : {field.Key}");
}
}
return failedModelChecks;
}
/// <summary>
/// Generates failed checks that correspond to inputs expected by the model that are not
/// present in the BrainParameters.
/// </summary>

Model model,
BrainParameters brainParameters,
int memory,
ModelActionType isContinuous,
var tensorsNames = GetInputTensors(model).Select(x => x.name).ToList();
var tensorsNames = model.GetInputNames();
// If there is no Vector Observation Input but the Brain Parameters expect one.
if ((brainParameters.VectorObservationSize != 0) &&

"The model does not contain a Vector Observation Placeholder Input. " +
"The model does not contain a Vector Observation Placeholder Input. " +
"You must set the Vector Observation Space Size to 0.");
}

visObsIndex++;
}
var expectedVisualObs = GetNumVisualInputs(model);
var expectedVisualObs = model.GetNumVisualInputs();
// Check if there's not enough visual sensors (too many would be handled above)
if (expectedVisualObs > visObsIndex)
{

}
// If the model uses discrete control but does not have an input for action masks
if (isContinuous == ModelActionType.Discrete)
if (model.HasDiscreteOutputs())
{
if (!tensorsNames.Contains(TensorNames.ActionMaskPlaceholder))
{

static IEnumerable<string> CheckOutputTensorPresence(Model model, int memory)
{
var failedModelChecks = new List<string>();
// If there is no Action Output.
if (!model.outputs.Contains(TensorNames.ActionOutput))
{
failedModelChecks.Add("The model does not contain an Action Output Node.");
}
// If there is no Recurrent Output but the model is Recurrent.
if (memory > 0)

}
// If the model expects an input but it is not in this list
foreach (var tensor in GetInputTensors(model))
foreach (var tensor in model.GetInputTensors())
{
if (!tensorTester.ContainsKey(tensor.name))
{

BrainParameters brainParameters, TensorProxy tensorProxy,
SensorComponent[] sensorComponents, int observableAttributeTotalSize)
{
// TODO: Update this check after integrating ActionSpec into BrainParameters
var numberActionsBp = brainParameters.VectorActionSize.Length;
var numberActionsT = tensorProxy.shape[tensorProxy.shape.Length - 1];
if (numberActionsBp != numberActionsT)

static IEnumerable<string> CheckOutputTensorShape(
Model model,
BrainParameters brainParameters,
ActuatorComponent[] actuatorComponents,
ModelActionType isContinuous,
int modelContinuousActionSize, int modelSumDiscreteBranchSizes)
ActuatorComponent[] actuatorComponents)
if (isContinuous == ModelActionType.Unknown)
{
failedModelChecks.Add("Cannot infer type of Control from the provided model.");
return failedModelChecks;
}
if (isContinuous == ModelActionType.Continuous &&
brainParameters.VectorActionSpaceType != SpaceType.Continuous)
{
failedModelChecks.Add(
"Model has been trained using Continuous Control but the Brain Parameters " +
"suggest Discrete Control.");
return failedModelChecks;
}
if (isContinuous == ModelActionType.Discrete &&
brainParameters.VectorActionSpaceType != SpaceType.Discrete)
{
failedModelChecks.Add(
"Model has been trained using Discrete Control but the Brain Parameters " +
"suggest Continuous Control.");
return failedModelChecks;
}
// This will need to change a bit for hybrid action spaces.
if (isContinuous == ModelActionType.Continuous)
if (model.HasContinuousOutputs())
tensorTester[TensorNames.ActionOutput] = CheckContinuousActionOutputShape;
tensorTester[model.ContinuousOutputName()] = CheckContinuousActionOutputShape;
else
if (model.HasDiscreteOutputs())
tensorTester[TensorNames.ActionOutput] = CheckDiscreteActionOutputShape;
tensorTester[model.DiscreteOutputName()] = CheckDiscreteActionOutputShape;
var modelContinuousActionSize = model.ContinuousOutputSize();
var modelSumDiscreteBranchSizes = model.DiscreteOutputSize();
foreach (var name in model.outputs)
{
if (tensorTester.ContainsKey(name))

4
com.unity.ml-agents/Runtime/Inference/GeneratorImpl.cs


foreach (var infoSensorPair in infos)
{
var info = infoSensorPair.agentInfo;
var pastAction = info.storedVectorActions;
if (pastAction != null)
var pastAction = info.storedVectorActions.DiscreteActions;
if (!pastAction.IsEmpty())
{
for (var j = 0; j < actionSize; j++)
{

12
com.unity.ml-agents/Runtime/Inference/ModelRunner.cs


internal class ModelRunner
{
List<AgentInfoSensorsPair> m_Infos = new List<AgentInfoSensorsPair>();
Dictionary<int, float[]> m_LastActionsReceived = new Dictionary<int, float[]>();
Dictionary<int, ActionBuffers> m_LastActionsReceived = new Dictionary<int, ActionBuffers>();
List<int> m_OrderedAgentsRequestingDecisions = new List<int>();
ITensorAllocator m_TensorAllocator;

m_Engine = null;
}
m_InferenceInputs = BarracudaModelParamLoader.GetInputTensors(barracudaModel);
m_OutputNames = BarracudaModelParamLoader.GetOutputNames(barracudaModel);
m_InferenceInputs = barracudaModel.GetInputTensors();
m_OutputNames = barracudaModel.GetOutputNames();
m_TensorGenerator = new TensorGenerator(
seed, m_TensorAllocator, m_Memories, barracudaModel);
m_TensorApplier = new TensorApplier(

if (!m_LastActionsReceived.ContainsKey(info.episodeId))
{
m_LastActionsReceived[info.episodeId] = null;
m_LastActionsReceived[info.episodeId] = ActionBuffers.Empty;
}
if (info.done)
{

return m_Model == other && m_InferenceDevice == otherInferenceDevice;
}
public float[] GetAction(int agentId)
public ActionBuffers GetAction(int agentId)
return null;
return ActionBuffers.Empty;
}
}
}

35
com.unity.ml-agents/Runtime/Inference/TensorApplier.cs


/// </param>
/// <param name="actionIds"> List of Agents Ids that will be updated using the tensor's data</param>
/// <param name="lastActions"> Dictionary of AgentId to Actions to be updated</param>
void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, float[]> lastActions);
void Apply(TensorProxy tensorProxy, IEnumerable<int> actionIds, Dictionary<int, ActionBuffers> lastActions);
}
readonly Dictionary<string, IApplier> m_Dict = new Dictionary<string, IApplier>();

Dictionary<int, List<float>> memories,
object barracudaModel = null)
{
actionSpec.CheckNotHybrid();
// If model is null, no inference to run and exception is thrown before reaching here.
if (barracudaModel == null)
{
return;
}
var model = (Model)barracudaModel;
if (!model.SupportsContinuousAndDiscrete())
{
actionSpec.CheckAllContinuousOrDiscrete();
}
m_Dict[TensorNames.ActionOutput] = new ContinuousActionOutputApplier();
var tensorName = model.ContinuousOutputName();
m_Dict[tensorName] = new ContinuousActionOutputApplier(actionSpec);
else
if (actionSpec.NumDiscreteActions > 0)
m_Dict[TensorNames.ActionOutput] =
new DiscreteActionOutputApplier(actionSpec.BranchSizes, seed, allocator);
var tensorName = model.DiscreteOutputName();
m_Dict[tensorName] = new DiscreteActionOutputApplier(actionSpec, seed, allocator);
if (barracudaModel != null)
for (var i = 0; i < model?.memories.Count; i++)
var model = (Model)barracudaModel;
for (var i = 0; i < model?.memories.Count; i++)
{
m_Dict[model.memories[i].output] =
new BarracudaMemoryOutputApplier(model.memories.Count, i, memories);
}
m_Dict[model.memories[i].output] =
new BarracudaMemoryOutputApplier(model.memories.Count, i, memories);
}
}

/// <exception cref="UnityAgentsException"> One of the tensors does not have an
/// associated applier.</exception>
public void ApplyTensors(
IEnumerable<TensorProxy> tensors, IEnumerable<int> actionIds, Dictionary<int, float[]> lastActions)
IEnumerable<TensorProxy> tensors, IEnumerable<int> actionIds, Dictionary<int, ActionBuffers> lastActions)
{
foreach (var tensor in tensors)
{

26
com.unity.ml-agents/Runtime/Inference/TensorGenerator.cs


Dictionary<int, List<float>> memories,
object barracudaModel = null)
{
// If model is null, no inference to run and exception is thrown before reaching here.
if (barracudaModel == null)
{
return;
}
var model = (Model)barracudaModel;
// Generator for Inputs
m_Dict[TensorNames.BatchSizePlaceholder] =
new BatchSizeGenerator(allocator);

new RecurrentInputGenerator(allocator, memories);
if (barracudaModel != null)
for (var i = 0; i < model.memories.Count; i++)
var model = (Model)barracudaModel;
for (var i = 0; i < model.memories.Count; i++)
{
m_Dict[model.memories[i].input] =
new BarracudaRecurrentInputGenerator(i, allocator, memories);
}
m_Dict[model.memories[i].input] =
new BarracudaRecurrentInputGenerator(i, allocator, memories);
}
m_Dict[TensorNames.PreviousActionPlaceholder] =

// Generators for Outputs
m_Dict[TensorNames.ActionOutput] = new BiDimensionalOutputGenerator(allocator);
if (model.HasContinuousOutputs())
{
m_Dict[model.ContinuousOutputName()] = new BiDimensionalOutputGenerator(allocator);
}
if (model.HasDiscreteOutputs())
{
m_Dict[model.DiscreteOutputName()] = new BiDimensionalOutputGenerator(allocator);
}
m_Dict[TensorNames.RecurrentOutput] = new BiDimensionalOutputGenerator(allocator);
m_Dict[TensorNames.ValueEstimateOutput] = new BiDimensionalOutputGenerator(allocator);
}

15
com.unity.ml-agents/Runtime/Inference/TensorNames.cs


public const string recurrentOutputC = "recurrent_out_c";
public const string MemorySize = "memory_size";
public const string VersionNumber = "version_number";
public const string IsContinuousControl = "is_continuous_control";
public const string ActionOutputShape = "action_output_shape";
public const string ActionOutput = "action";
public const string ContinuousActionOutputShape = "continuous_action_output_shape";
public const string DiscreteActionOutputShape = "discrete_action_output_shape";
public const string ContinuousActionOutput = "continuous_actions";
public const string DiscreteActionOutput = "discrete_actions";
public static readonly string[] RequiredConstants =
{
VersionNumber, MemorySize, IsContinuousControl, ActionOutputShape
};
// Deprecated TensorNames entries for backward compatibility
public const string IsContinuousControlDeprecated = "is_continuous_control";
public const string ActionOutputDeprecated = "action";
public const string ActionOutputShapeDeprecated = "action_output_shape";
}
}

19
com.unity.ml-agents/Runtime/Policies/BarracudaPolicy.cs


/// Sensor shapes for the associated Agents. All Agents must have the same shapes for their Sensors.
/// </summary>
List<int[]> m_SensorShapes;
SpaceType m_SpaceType;
ActionSpec m_ActionSpec;
/// <inheritdoc />
public BarracudaPolicy(

{
var modelRunner = Academy.Instance.GetOrCreateModelRunner(model, actionSpec, inferenceDevice);
m_ModelRunner = modelRunner;
actionSpec.CheckNotHybrid();
m_SpaceType = actionSpec.NumContinuousActions > 0 ? SpaceType.Continuous : SpaceType.Discrete;
m_ActionSpec = actionSpec;
}
/// <inheritdoc />

/// <inheritdoc />
public ref readonly ActionBuffers DecideAction()
{
m_ModelRunner?.DecideBatch();
var actions = m_ModelRunner?.GetAction(m_AgentId);
if (m_SpaceType == SpaceType.Continuous)
if (m_ModelRunner == null)
{
m_LastActionBuffer = ActionBuffers.Empty;
}
else
m_LastActionBuffer = new ActionBuffers(actions, Array.Empty<int>());
return ref m_LastActionBuffer;
m_ModelRunner?.DecideBatch();
m_LastActionBuffer = m_ModelRunner.GetAction(m_AgentId);
m_LastActionBuffer = ActionBuffers.FromDiscreteActions(actions);
return ref m_LastActionBuffer;
}

14
com.unity.ml-agents/Runtime/Policies/RemotePolicy.cs


{
int m_AgentId;
string m_FullyQualifiedBehaviorName;
SpaceType m_SpaceType;
ActionSpec m_ActionSpec;
ActionBuffers m_LastActionBuffer;
internal ICommunicator m_Communicator;

m_FullyQualifiedBehaviorName = fullyQualifiedBehaviorName;
m_Communicator = Academy.Instance.Communicator;
m_Communicator.SubscribeBrain(m_FullyQualifiedBehaviorName, actionSpec);
actionSpec.CheckNotHybrid();
m_SpaceType = actionSpec.NumContinuousActions > 0 ? SpaceType.Continuous : SpaceType.Discrete;
m_ActionSpec = actionSpec;
}
/// <inheritdoc />

{
m_Communicator?.DecideBatch();
var actions = m_Communicator?.GetActions(m_FullyQualifiedBehaviorName, m_AgentId);
// TODO figure out how to handle this with multiple space types.
if (m_SpaceType == SpaceType.Continuous)
{
m_LastActionBuffer = new ActionBuffers(actions, Array.Empty<int>());
return ref m_LastActionBuffer;
}
m_LastActionBuffer = ActionBuffers.FromDiscreteActions(actions);
m_LastActionBuffer = actions == null ? ActionBuffers.Empty : (ActionBuffers)actions;
return ref m_LastActionBuffer;
}

12
com.unity.ml-agents/Tests/Editor/Actuators/ActuatorManagerTests.cs


}
[Test]
public void TestFailOnMixedActionSpace()
{
var manager = new ActuatorManager();
var actuator1 = new TestActuator(ActionSpec.MakeDiscrete(new[] { 1, 2, 3, 4 }), "actuator1");
var actuator2 = new TestActuator(ActionSpec.MakeContinuous(3), "actuator2");
manager.Add(actuator1);
manager.Add(actuator2);
LogAssert.Expect(LogType.Assert, "Actuators on the same Agent must have the same action SpaceType.");
manager.ReadyActuatorsForExecution(new[] { actuator1, actuator2 }, 3, 10, 4);
}
[Test]
public void TestFailOnSameActuatorName()
{
var manager = new ActuatorManager();

3
com.unity.ml-agents/Tests/Editor/DemonstrationTests.cs


using UnityEngine;
using System.IO.Abstractions.TestingHelpers;
using System.Reflection;
using Unity.MLAgents.Actuators;
using Unity.MLAgents.CommunicatorObjects;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Demonstrations;

done = true,
episodeId = 5,
maxStepReached = true,
storedVectorActions = new[] { 0f, 1f },
storedVectorActions = new ActionBuffers(null, new int[] { 0, 1 }),
};

74
com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorApplier.cs


using Unity.Barracuda;
using Unity.MLAgents.Actuators;
using Unity.MLAgents.Inference;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Tests
{

[Test]
public void ApplyContinuousActionOutput()
{
var actionSpec = ActionSpec.MakeContinuous(3);
var inputTensor = new TensorProxy()
{
shape = new long[] { 2, 3 },

var applier = new ContinuousActionOutputApplier();
var applier = new ContinuousActionOutputApplier(actionSpec);
var actionDict = new Dictionary<int, float[]>() { { 0, null }, { 1, null } };
var actionDict = new Dictionary<int, ActionBuffers>() { { 0, ActionBuffers.Empty }, { 1, ActionBuffers.Empty } };
Assert.AreEqual(actionDict[0][0], 1);
Assert.AreEqual(actionDict[0][1], 2);
Assert.AreEqual(actionDict[0][2], 3);
Assert.AreEqual(actionDict[0].ContinuousActions[0], 1);
Assert.AreEqual(actionDict[0].ContinuousActions[1], 2);
Assert.AreEqual(actionDict[0].ContinuousActions[2], 3);
Assert.AreEqual(actionDict[1][0], 4);
Assert.AreEqual(actionDict[1][1], 5);
Assert.AreEqual(actionDict[1][2], 6);
Assert.AreEqual(actionDict[1].ContinuousActions[0], 4);
Assert.AreEqual(actionDict[1].ContinuousActions[1], 5);
Assert.AreEqual(actionDict[1].ContinuousActions[2], 6);
var actionSpec = ActionSpec.MakeDiscrete(new int[] { 2, 3 });
var inputTensor = new TensorProxy()
{
shape = new long[] { 2, 5 },

new[] { 0.5f, 22.5f, 0.1f, 5f, 1f, 4f, 5f, 6f, 7f, 8f })
};
var alloc = new TensorCachingAllocator();
var applier = new DiscreteActionOutputApplier(new[] { 2, 3 }, 0, alloc);
var applier = new DiscreteActionOutputApplier(actionSpec, 0, alloc);
var actionDict = new Dictionary<int, float[]>() { { 0, null }, { 1, null } };
var actionDict = new Dictionary<int, ActionBuffers>() { { 0, ActionBuffers.Empty }, { 1, ActionBuffers.Empty } };
Assert.AreEqual(actionDict[0][0], 1);
Assert.AreEqual(actionDict[0][1], 1);
Assert.AreEqual(actionDict[0].DiscreteActions[0], 1);
Assert.AreEqual(actionDict[0].DiscreteActions[1], 1);
Assert.AreEqual(actionDict[1][0], 1);
Assert.AreEqual(actionDict[1][1], 2);
Assert.AreEqual(actionDict[1].DiscreteActions[0], 1);
Assert.AreEqual(actionDict[1].DiscreteActions[1], 2);
alloc.Dispose();
}
[Test]
public void ApplyHybridActionOutput()
{
var actionSpec = new ActionSpec(3, 2, new int[] { 2, 3 });
var continuousInputTensor = new TensorProxy()
{
shape = new long[] { 2, 3 },
data = new Tensor(2, 3, new float[] { 1, 2, 3, 4, 5, 6 })
};
var discreteInputTensor = new TensorProxy()
{
shape = new long[] { 2, 8 },
data = new Tensor(
2,
5,
new[] { 0.5f, 22.5f, 0.1f, 5f, 1f, 4f, 5f, 6f, 7f, 8f })
};
var continuousApplier = new ContinuousActionOutputApplier(actionSpec);
var alloc = new TensorCachingAllocator();
var discreteApplier = new DiscreteActionOutputApplier(actionSpec, 0, alloc);
var agentIds = new List<int>() { 0, 1 };
// Dictionary from AgentId to Action
var actionDict = new Dictionary<int, ActionBuffers>() { { 0, ActionBuffers.Empty }, { 1, ActionBuffers.Empty } };
continuousApplier.Apply(continuousInputTensor, agentIds, actionDict);
discreteApplier.Apply(discreteInputTensor, agentIds, actionDict);
Assert.AreEqual(actionDict[0].ContinuousActions[0], 1);
Assert.AreEqual(actionDict[0].ContinuousActions[1], 2);
Assert.AreEqual(actionDict[0].ContinuousActions[2], 3);
Assert.AreEqual(actionDict[0].DiscreteActions[0], 1);
Assert.AreEqual(actionDict[0].DiscreteActions[1], 1);
Assert.AreEqual(actionDict[1].ContinuousActions[0], 4);
Assert.AreEqual(actionDict[1].ContinuousActions[1], 5);
Assert.AreEqual(actionDict[1].ContinuousActions[2], 6);
Assert.AreEqual(actionDict[1].DiscreteActions[0], 1);
Assert.AreEqual(actionDict[1].DiscreteActions[1], 2);
alloc.Dispose();
}
}

7
com.unity.ml-agents/Tests/Editor/EditModeTestInternalBrainTensorGenerator.cs


using Unity.Barracuda;
using NUnit.Framework;
using UnityEngine;
using Unity.MLAgents.Actuators;
using Unity.MLAgents.Inference;
using Unity.MLAgents.Policies;
using Unity.MLAgents.Sensors.Reflection;

var infoA = new AgentInfo
{
storedVectorActions = new[] { 1f, 2f },
discreteActionMasks = null
storedVectorActions = new ActionBuffers(null, new[] { 1, 2 }),
discreteActionMasks = null,
storedVectorActions = new[] { 3f, 4f },
storedVectorActions = new ActionBuffers(null, new[] { 3, 4 }),
discreteActionMasks = new[] { true, false, false, false, false },
};

62
com.unity.ml-agents/Tests/Editor/ModelRunnerTest.cs


using Unity.Barracuda;
using Unity.MLAgents.Actuators;
using Unity.MLAgents.Inference;
using Unity.MLAgents.Sensors;
using Unity.MLAgents.Policies;
namespace Unity.MLAgents.Tests

{
const string k_continuous2vis8vec2actionPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action.nn";
const string k_discrete1vis0vec_2_3action_recurrModelPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr.nn";
NNModel continuous2vis8vec2actionModel;
NNModel discrete1vis0vec_2_3action_recurrModel;
const string k_continuousONNXPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action.onnx";
const string k_discreteONNXPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr.onnx";
const string k_hybridONNXPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/hybrid0vis53vec_3c_2daction.onnx";
const string k_continuousNNPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action_deprecated.nn";
const string k_discreteNNPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr_deprecated.nn";
NNModel continuousONNXModel;
NNModel discreteONNXModel;
NNModel hybridONNXModel;
NNModel continuousNNModel;
NNModel discreteNNModel;
Test3DSensorComponent sensor_21_20_3;
Test3DSensorComponent sensor_20_22_3;

return ActionSpec.MakeDiscrete(2, 3);
}
ActionSpec GetHybrid0vis53vec_3c_2dActionSpec()
{
return new ActionSpec(3, 1, new int[] { 2 });
}
continuous2vis8vec2actionModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_continuous2vis8vec2actionPath, typeof(NNModel));
discrete1vis0vec_2_3action_recurrModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_discrete1vis0vec_2_3action_recurrModelPath, typeof(NNModel));
continuousONNXModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_continuousONNXPath, typeof(NNModel));
discreteONNXModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_discreteONNXPath, typeof(NNModel));
hybridONNXModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_hybridONNXPath, typeof(NNModel));
continuousNNModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_continuousNNPath, typeof(NNModel));
discreteNNModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_discreteNNPath, typeof(NNModel));
var go = new GameObject("SensorA");
sensor_21_20_3 = go.AddComponent<Test3DSensorComponent>();
sensor_21_20_3.Sensor = new Test3DSensor("SensorA", 21, 20, 3);

[Test]
public void TestModelExist()
{
Assert.IsNotNull(continuous2vis8vec2actionModel);
Assert.IsNotNull(discrete1vis0vec_2_3action_recurrModel);
Assert.IsNotNull(continuousONNXModel);
Assert.IsNotNull(discreteONNXModel);
Assert.IsNotNull(hybridONNXModel);
Assert.IsNotNull(continuousNNModel);
Assert.IsNotNull(discreteNNModel);
var modelRunner = new ModelRunner(continuous2vis8vec2actionModel, GetContinuous2vis8vec2actionActionSpec());
var modelRunner = new ModelRunner(continuousONNXModel, GetContinuous2vis8vec2actionActionSpec());
modelRunner = new ModelRunner(discrete1vis0vec_2_3action_recurrModel, GetDiscrete1vis0vec_2_3action_recurrModelActionSpec());
modelRunner = new ModelRunner(discreteONNXModel, GetDiscrete1vis0vec_2_3action_recurrModelActionSpec());
modelRunner.Dispose();
modelRunner = new ModelRunner(hybridONNXModel, GetHybrid0vis53vec_3c_2dActionSpec());
modelRunner.Dispose();
modelRunner = new ModelRunner(continuousNNModel, GetContinuous2vis8vec2actionActionSpec());
modelRunner.Dispose();
modelRunner = new ModelRunner(discreteNNModel, GetDiscrete1vis0vec_2_3action_recurrModelActionSpec());
modelRunner.Dispose();
}

var modelRunner = new ModelRunner(continuous2vis8vec2actionModel, GetContinuous2vis8vec2actionActionSpec(), InferenceDevice.CPU);
Assert.True(modelRunner.HasModel(continuous2vis8vec2actionModel, InferenceDevice.CPU));
Assert.False(modelRunner.HasModel(continuous2vis8vec2actionModel, InferenceDevice.GPU));
Assert.False(modelRunner.HasModel(discrete1vis0vec_2_3action_recurrModel, InferenceDevice.CPU));
var modelRunner = new ModelRunner(continuousONNXModel, GetContinuous2vis8vec2actionActionSpec(), InferenceDevice.CPU);
Assert.True(modelRunner.HasModel(continuousONNXModel, InferenceDevice.CPU));
Assert.False(modelRunner.HasModel(continuousONNXModel, InferenceDevice.GPU));
Assert.False(modelRunner.HasModel(discreteONNXModel, InferenceDevice.CPU));
modelRunner.Dispose();
}

var actionSpec = GetDiscrete1vis0vec_2_3action_recurrModelActionSpec();
var modelRunner = new ModelRunner(discrete1vis0vec_2_3action_recurrModel, actionSpec);
var modelRunner = new ModelRunner(discreteONNXModel, actionSpec);
var info1 = new AgentInfo();
info1.episodeId = 1;
modelRunner.PutObservations(info1, new[] { sensor_21_20_3.CreateSensor() }.ToList());

modelRunner.DecideBatch();
Assert.IsNotNull(modelRunner.GetAction(1));
Assert.IsNotNull(modelRunner.GetAction(2));
Assert.IsNull(modelRunner.GetAction(3));
Assert.AreEqual(actionSpec.NumDiscreteActions, modelRunner.GetAction(1).Count());
Assert.IsFalse(modelRunner.GetAction(1).Equals(ActionBuffers.Empty));
Assert.IsFalse(modelRunner.GetAction(2).Equals(ActionBuffers.Empty));
Assert.IsTrue(modelRunner.GetAction(3).Equals(ActionBuffers.Empty));
Assert.AreEqual(actionSpec.NumDiscreteActions, modelRunner.GetAction(1).DiscreteActions.Length);
modelRunner.Dispose();
}
}

212
com.unity.ml-agents/Tests/Editor/ParameterLoaderTest.cs


[TestFixture]
public class ParameterLoaderTest
{
const string k_continuous2vis8vec2actionPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action.nn";
const string k_discrete1vis0vec_2_3action_recurrModelPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr.nn";
NNModel continuous2vis8vec2actionModel;
NNModel discrete1vis0vec_2_3action_recurrModel;
// ONNX model with continuous/discrete action outputs (supports hybrid actions)
const string k_continuousONNXPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action.onnx";
const string k_discreteONNXPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr.onnx";
const string k_hybridONNXPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/hybrid0vis53vec_3c_2daction.onnx";
// NN model with a single action output (deprecated, does not support hybrid actions).
// Same BrainParameters settings as the corresponding ONNX model.
const string k_continuousNNPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action_deprecated.nn";
const string k_discreteNNPath = "Packages/com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr_deprecated.nn";
NNModel continuousONNXModel;
NNModel discreteONNXModel;
NNModel hybridONNXModel;
NNModel continuousNNModel;
NNModel discreteNNModel;
Test3DSensorComponent sensor_21_20_3;
Test3DSensorComponent sensor_20_22_3;

return validBrainParameters;
}
// TODO: update and enable this after integrating action spec into BrainParameters
// BrainParameters GetHybridBrainParameters()
// {
// var validBrainParameters = new BrainParameters();
// validBrainParameters.VectorObservationSize = 53;
// validBrainParameters.VectorActionSize = new[] { 2 };
// validBrainParameters.NumStackedVectorObservations = 1;
// validBrainParameters.VectorActionSpaceType = SpaceType.Discrete;
// return validBrainParameters;
// }
continuous2vis8vec2actionModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_continuous2vis8vec2actionPath, typeof(NNModel));
discrete1vis0vec_2_3action_recurrModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_discrete1vis0vec_2_3action_recurrModelPath, typeof(NNModel));
continuousONNXModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_continuousONNXPath, typeof(NNModel));
discreteONNXModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_discreteONNXPath, typeof(NNModel));
hybridONNXModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_hybridONNXPath, typeof(NNModel));
continuousNNModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_continuousNNPath, typeof(NNModel));
discreteNNModel = (NNModel)AssetDatabase.LoadAssetAtPath(k_discreteNNPath, typeof(NNModel));
var go = new GameObject("SensorA");
sensor_21_20_3 = go.AddComponent<Test3DSensorComponent>();
sensor_21_20_3.Sensor = new Test3DSensor("SensorA", 21, 20, 3);

[Test]
public void TestModelExist()
{
Assert.IsNotNull(continuous2vis8vec2actionModel);
Assert.IsNotNull(discrete1vis0vec_2_3action_recurrModel);
Assert.IsNotNull(continuousONNXModel);
Assert.IsNotNull(discreteONNXModel);
Assert.IsNotNull(hybridONNXModel);
Assert.IsNotNull(continuousNNModel);
Assert.IsNotNull(discreteNNModel);
[Test]
public void TestGetInputTensors1()
[TestCase(true)]
[TestCase(false)]
public void TestGetInputTensorsContinuous(bool useDeprecatedNNModel)
var model = ModelLoader.Load(continuous2vis8vec2actionModel);
var inputTensors = BarracudaModelParamLoader.GetInputTensors(model);
var inputNames = inputTensors.Select(x => x.name).ToList();
var model = useDeprecatedNNModel ? ModelLoader.Load(continuousNNModel) : ModelLoader.Load(continuousONNXModel);
var inputNames = model.GetInputNames();
Assert.AreEqual(3, inputNames.Count);
Assert.AreEqual(3, inputNames.Count());
Assert.AreEqual(2, BarracudaModelParamLoader.GetNumVisualInputs(model));
Assert.AreEqual(2, model.GetNumVisualInputs());
Assert.AreEqual(0, BarracudaModelParamLoader.GetInputTensors(null).Count);
Assert.AreEqual(0, BarracudaModelParamLoader.GetNumVisualInputs(null));
model = null;
Assert.AreEqual(0, model.GetInputTensors().Count);
Assert.AreEqual(0, model.GetNumVisualInputs());
[Test]
public void TestGetInputTensors2()
[TestCase(true)]
[TestCase(false)]
public void TestGetInputTensorsDiscrete(bool useDeprecatedNNModel)
var model = ModelLoader.Load(discrete1vis0vec_2_3action_recurrModel);
var inputTensors = BarracudaModelParamLoader.GetInputTensors(model);
var inputNames = inputTensors.Select(x => x.name).ToList();
var model = useDeprecatedNNModel ? ModelLoader.Load(discreteNNModel) : ModelLoader.Load(discreteONNXModel);
var inputNames = model.GetInputNames();
// Model should contain 2 inputs : recurrent and visual 1
Assert.Contains(TensorNames.VisualObservationPlaceholderPrefix + "0", inputNames);

[Test]
public void TestGetOutputTensors1()
public void TestGetInputTensorsHybrid()
{
var model = ModelLoader.Load(hybridONNXModel);
var inputNames = model.GetInputNames();
Assert.Contains(TensorNames.VectorObservationPlaceholder, inputNames);
}
[TestCase(true)]
[TestCase(false)]
public void TestGetOutputTensorsContinuous(bool useDeprecatedNNModel)
var model = ModelLoader.Load(continuous2vis8vec2actionModel);
var outputNames = BarracudaModelParamLoader.GetOutputNames(model);
Assert.Contains(TensorNames.ActionOutput, outputNames);
var model = useDeprecatedNNModel ? ModelLoader.Load(continuousNNModel) : ModelLoader.Load(continuousONNXModel);
var outputNames = model.GetOutputNames();
var actionOutputName = useDeprecatedNNModel ? TensorNames.ActionOutputDeprecated : TensorNames.ContinuousActionOutput;
Assert.Contains(actionOutputName, outputNames);
Assert.AreEqual(0, BarracudaModelParamLoader.GetOutputNames(null).Count());
model = null;
Assert.AreEqual(0, model.GetOutputNames().Count());
[Test]
public void TestGetOutputTensors2()
[TestCase(true)]
[TestCase(false)]
public void TestGetOutputTensorsDiscrete(bool useDeprecatedNNModel)
var model = ModelLoader.Load(discrete1vis0vec_2_3action_recurrModel);
var outputNames = BarracudaModelParamLoader.GetOutputNames(model);
Assert.Contains(TensorNames.ActionOutput, outputNames);
var model = useDeprecatedNNModel ? ModelLoader.Load(discreteNNModel) : ModelLoader.Load(discreteONNXModel);
var outputNames = model.GetOutputNames();
var actionOutputName = useDeprecatedNNModel ? TensorNames.ActionOutputDeprecated : TensorNames.DiscreteActionOutput;
Assert.Contains(actionOutputName, outputNames);
public void TestCheckModelValid1()
public void TestGetOutputTensorsHybrid()
var model = ModelLoader.Load(continuous2vis8vec2actionModel);
var model = ModelLoader.Load(hybridONNXModel);
var outputNames = model.GetOutputNames();
Assert.AreEqual(2, outputNames.Count());
Assert.Contains(TensorNames.ContinuousActionOutput, outputNames);
Assert.Contains(TensorNames.DiscreteActionOutput, outputNames);
model = null;
Assert.AreEqual(0, model.GetOutputNames().Count());
}
[TestCase(true)]
[TestCase(false)]
public void TestCheckModelValidContinuous(bool useDeprecatedNNModel)
{
var model = useDeprecatedNNModel ? ModelLoader.Load(continuousNNModel) : ModelLoader.Load(continuousONNXModel);
var validBrainParameters = GetContinuous2vis8vec2actionBrainParameters();
var errors = BarracudaModelParamLoader.CheckModel(

Assert.AreEqual(0, errors.Count()); // There should not be any errors
}
[Test]
public void TestCheckModelValid2()
[TestCase(true)]
[TestCase(false)]
public void TestCheckModelValidDiscrete(bool useDeprecatedNNModel)
var model = ModelLoader.Load(discrete1vis0vec_2_3action_recurrModel);
var model = useDeprecatedNNModel ? ModelLoader.Load(discreteNNModel) : ModelLoader.Load(discreteONNXModel);
var validBrainParameters = GetDiscrete1vis0vec_2_3action_recurrModelBrainParameters();
var errors = BarracudaModelParamLoader.CheckModel(

Assert.AreEqual(0, errors.Count()); // There should not be any errors
}
[Test]
public void TestCheckModelThrowsVectorObservation1()
// TODO: update and enable this test after integrating action spec into BrainParameters
// [Test]
// public void TestCheckModelValidHybrid()
// {
// var model = ModelLoader.Load(hybridModel);
// var validBrainParameters = GetHybridBrainParameters();
// var errors = BarracudaModelParamLoader.CheckModel(
// model, validBrainParameters,
// new SensorComponent[] { }, new ActuatorComponent[0]
// );
// Assert.AreEqual(0, errors.Count()); // There should not be any errors
// }
[TestCase(true)]
[TestCase(false)]
public void TestCheckModelThrowsVectorObservationContinuous(bool useDeprecatedNNModel)
var model = ModelLoader.Load(continuous2vis8vec2actionModel);
var model = useDeprecatedNNModel ? ModelLoader.Load(continuousNNModel) : ModelLoader.Load(continuousONNXModel);
var brainParameters = GetContinuous2vis8vec2actionBrainParameters();
brainParameters.VectorObservationSize = 9; // Invalid observation

Assert.Greater(errors.Count(), 0);
}
[Test]
public void TestCheckModelThrowsVectorObservation2()
[TestCase(true)]
[TestCase(false)]
public void TestCheckModelThrowsVectorObservationDiscrete(bool useDeprecatedNNModel)
var model = ModelLoader.Load(discrete1vis0vec_2_3action_recurrModel);
var model = useDeprecatedNNModel ? ModelLoader.Load(discreteNNModel) : ModelLoader.Load(discreteONNXModel);
var brainParameters = GetDiscrete1vis0vec_2_3action_recurrModelBrainParameters();
brainParameters.VectorObservationSize = 1; // Invalid observation

[Test]
public void TestCheckModelThrowsAction1()
// TODO: update and enable this test after integrating action spec into BrainParameters
// [Test]
// public void TestCheckModelThrowsVectorObservationHybrid()
// {
// var model = ModelLoader.Load(hybridModel);
// var brainParameters = GetHybridBrainParameters();
// brainParameters.VectorObservationSize = 9; // Invalid observation
// var errors = BarracudaModelParamLoader.CheckModel(
// model, brainParameters,
// new SensorComponent[] { }, new ActuatorComponent[0]
// );
// Assert.Greater(errors.Count(), 0);
// brainParameters = GetContinuous2vis8vec2actionBrainParameters();
// brainParameters.NumStackedVectorObservations = 2;// Invalid stacking
// errors = BarracudaModelParamLoader.CheckModel(
// model, brainParameters,
// new SensorComponent[] { }, new ActuatorComponent[0]
// );
// Assert.Greater(errors.Count(), 0);
// }
[TestCase(true)]
[TestCase(false)]
public void TestCheckModelThrowsActionContinuous(bool useDeprecatedNNModel)
var model = ModelLoader.Load(continuous2vis8vec2actionModel);
var model = useDeprecatedNNModel ? ModelLoader.Load(continuousNNModel) : ModelLoader.Load(continuousONNXModel);
var brainParameters = GetContinuous2vis8vec2actionBrainParameters();
brainParameters.VectorActionSize = new[] { 3 }; // Invalid action

Assert.Greater(errors.Count(), 0);
}
[Test]
public void TestCheckModelThrowsAction2()
[TestCase(true)]
[TestCase(false)]
public void TestCheckModelThrowsActionDiscrete(bool useDeprecatedNNModel)
var model = ModelLoader.Load(discrete1vis0vec_2_3action_recurrModel);
var model = useDeprecatedNNModel ? ModelLoader.Load(discreteNNModel) : ModelLoader.Load(discreteONNXModel);
var brainParameters = GetDiscrete1vis0vec_2_3action_recurrModelBrainParameters();
brainParameters.VectorActionSize = new[] { 3, 3 }; // Invalid action

errors = BarracudaModelParamLoader.CheckModel(model, brainParameters, new SensorComponent[] { sensor_21_20_3 }, new ActuatorComponent[0]);
Assert.Greater(errors.Count(), 0);
}
// TODO: update and enable this test after integrating action spec into BrainParameters
// [Test]
// public void TestCheckModelThrowsActionHybrid()
// {
// var model = ModelLoader.Load(hybridModel);
// var brainParameters = GetHybridBrainParameters();
// brainParameters.VectorActionSize = new[] { 3 }; // Invalid action
// var errors = BarracudaModelParamLoader.CheckModel(model, brainParameters, new SensorComponent[] { sensor_21_20_3, sensor_20_22_3 }, new ActuatorComponent[0]);
// Assert.Greater(errors.Count(), 0);
// brainParameters = GetContinuous2vis8vec2actionBrainParameters();
// brainParameters.VectorActionSpaceType = SpaceType.Discrete;// Invalid SpaceType
// errors = BarracudaModelParamLoader.CheckModel(model, brainParameters, new SensorComponent[] { sensor_21_20_3, sensor_20_22_3 }, new ActuatorComponent[0]);
// Assert.Greater(errors.Count(), 0);
// }
[Test]
public void TestCheckModelThrowsNoModel()

2
com.unity.ml-agents/Tests/Editor/TestModels/continuous2vis8vec2action_deprecated.nn.meta


fileFormatVersion: 2
guid: a75582ff670094ff2996c1c4ab9dfd15
guid: bf4543cc3c6944794bbba065bdf90079
ScriptedImporter:
fileIDToRecycleName:
11400000: main obj

2
com.unity.ml-agents/Tests/Editor/TestModels/discrete1vis0vec_2_3action_recurr_deprecated.nn.meta


fileFormatVersion: 2
guid: 8a92fbcd96caa4ef5a93dd55c0c36705
guid: 6d6040ad621454dd5b713beb5483e347
ScriptedImporter:
fileIDToRecycleName:
11400000: main obj

8
docs/Getting-Started.md


#### Behavior Parameters : Vector Action Space
An Agent is given instructions in the form of a float array of _actions_.
An Agent is given instructions in the form of actions.
The 3D Balance Ball example is programmed to use continuous action space which
is a vector of numbers that can vary continuously. More specifically, it uses
a `Space Size` of 2 to control the amount of `x` and `z` rotations to apply to
The 3D Balance Ball example is programmed to use continuous actions, which
are a vector of floating-point numbers that can vary continuously. More specifically,
it uses a `Space Size` of 2 to control the amount of `x` and `z` rotations to apply to
itself to keep the ball balanced on its head.
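
To make this concrete, here is a minimal, illustrative sketch of how an agent could consume those two continuous values inside `OnActionReceived` (the class name and the rotation scaling are illustrative, not the actual Ball3DAgent source):

```csharp
using Unity.MLAgents;
using Unity.MLAgents.Actuators;
using UnityEngine;

public class BalanceSketchAgent : Agent
{
    public override void OnActionReceived(ActionBuffers actionBuffers)
    {
        // `Space Size` of 2: one continuous value per rotation axis.
        var zRotation = 2f * Mathf.Clamp(actionBuffers.ContinuousActions[0], -1f, 1f);
        var xRotation = 2f * Mathf.Clamp(actionBuffers.ContinuousActions[1], -1f, 1f);
        transform.Rotate(new Vector3(0f, 0f, 1f), zRotation);
        transform.Rotate(new Vector3(1f, 0f, 0f), xRotation);
    }
}
```
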
## Running a pre-trained model

15
docs/Learning-Environment-Create-New.md


- `OnEpisodeBegin()`
- `CollectObservations(VectorSensor sensor)`
- `OnActionReceived(float[] vectorAction)`
- `OnActionReceived(ActionBuffers actionBuffers)`
We overview each of these in more detail in the dedicated subsections below.
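
As an orientation, a bare skeleton of the tutorial's `RollerAgent` class with these three overrides might look like the following sketch (method bodies elided; each is covered in the subsections below):

```csharp
using Unity.MLAgents;
using Unity.MLAgents.Actuators;
using Unity.MLAgents.Sensors;

public class RollerAgent : Agent
{
    public override void OnEpisodeBegin()
    {
        // Reset the agent and move the target (see below).
    }

    public override void CollectObservations(VectorSensor sensor)
    {
        // Add target/agent positions and agent velocity (see below).
    }

    public override void OnActionReceived(ActionBuffers actionBuffers)
    {
        // Apply the two continuous actions as a force and assign rewards (see below).
    }
}
```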

```csharp
public float forceMultiplier = 10;
public override void OnActionReceived(float[] vectorAction)
public override void OnActionReceived(ActionBuffers actionBuffers)
controlSignal.x = vectorAction[0];
controlSignal.z = vectorAction[1];
controlSignal.x = actionBuffers.ContinuousActions[0];
controlSignal.z = actionBuffers.ContinuousActions[1];
rBody.AddForce(controlSignal * forceMultiplier);
// Rewards

(which correspond to the keyboard arrow keys):
```csharp
public override void Heuristic(float[] actionsOut)
public override void Heuristic(in ActionBuffers actionsOut)
actionsOut[0] = Input.GetAxis("Horizontal");
actionsOut[1] = Input.GetAxis("Vertical");
var continuousActionsOut = actionsOut.ContinuousActions;
continuousActionsOut[0] = Input.GetAxis("Horizontal");
continuousActionsOut[1] = Input.GetAxis("Vertical");
}
```
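
Because `ActionBuffers` carries both a continuous and a discrete segment, the same pattern extends naturally once an agent also defines discrete branches. A hedged sketch (the jump branch is hypothetical and not part of this tutorial):

```csharp
public override void Heuristic(in ActionBuffers actionsOut)
{
    // Continuous segment: same arrow-key mapping as above.
    var continuousActionsOut = actionsOut.ContinuousActions;
    continuousActionsOut[0] = Input.GetAxis("Horizontal");
    continuousActionsOut[1] = Input.GetAxis("Vertical");

    // Discrete segment: a single hypothetical branch (0 = idle, 1 = jump).
    var discreteActionsOut = actionsOut.DiscreteActions;
    discreteActionsOut[0] = Input.GetKey(KeyCode.Space) ? 1 : 0;
}
```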

80
docs/Learning-Environment-Design-Agents.md


## Actions
An action is an instruction from the Policy that the agent carries out. The
action is passed to the Agent as a parameter when the Academy invokes the
agent's `OnActionReceived()` function. Actions for an agent can take one of two
forms, either **Continuous** or **Discrete**.
When you specify that the vector action space is **Continuous**, the action
parameter passed to the Agent is an array of floating point numbers with length
equal to the `Vector Action Space Size` property. When you specify a
**Discrete** vector action space type, the action parameter is an array
containing integers. Each integer is an index into a list or table of commands.
In the **Discrete** vector action space type, the action parameter is an array
of indices. The number of indices in the array is determined by the number of
branches defined in the `Branches Size` property. Each branch corresponds to an
action table, you can specify the size of each table by modifying the `Branches`
property.
action is passed to the Agent as the `ActionBuffers` parameter when the Academy invokes the
agent's `OnActionReceived()` function. There are two types of actions supported:
**Continuous** and **Discrete**.
Neither the Policy nor the training algorithm knows anything about what the
action values themselves mean. The training algorithm simply tries different

### Continuous Action Space
When an Agent uses a Policy set to the **Continuous** vector action space, the
action parameter passed to the Agent's `OnActionReceived()` function is an array
with length equal to the `Vector Action Space Size` property value. The
When an Agent's Policy has **Continuous** actions, the
`ActionBuffers.ContinuousActions` passed to the Agent's `OnActionReceived()` function
is an array with length equal to the `Vector Action Space Size` property value. The
individual values in the array have whatever meanings that you ascribe to them.
If you assign an element in the array as the speed of an Agent, for example, the
training process learns to control the speed of the Agent through this

These control values are applied as torques to the bodies making up the arm:
```csharp
public override void OnActionReceived(float[] act)
{
float torque_x = Mathf.Clamp(act[0], -1, 1) * 100f;
float torque_z = Mathf.Clamp(act[1], -1, 1) * 100f;
rbA.AddTorque(new Vector3(torque_x, 0f, torque_z));
public override void OnActionReceived(ActionBuffers actionBuffers)
{
var torqueX = Mathf.Clamp(actionBuffers.ContinuousActions[0], -1f, 1f) * 150f;
var torqueZ = Mathf.Clamp(actionBuffers.ContinuousActions[1], -1f, 1f) * 150f;
m_RbA.AddTorque(new Vector3(torqueX, 0f, torqueZ));
torque_x = Mathf.Clamp(act[2], -1, 1) * 100f;
torque_z = Mathf.Clamp(act[3], -1, 1) * 100f;
rbB.AddTorque(new Vector3(torque_x, 0f, torque_z));
}
torqueX = Mathf.Clamp(actionBuffers.ContinuousActions[2], -1f, 1f) * 150f;
torqueZ = Mathf.Clamp(actionBuffers.ContinuousActions[3], -1f, 1f) * 150f;
m_RbB.AddTorque(new Vector3(torqueX, 0f, torqueZ));
}
```
By default the output from our provided PPO algorithm pre-clamps the values of

### Discrete Action Space
When an Agent uses a **Discrete** vector action space, the action parameter
passed to the Agent's `OnActionReceived()` function is an array containing
indices. With the discrete vector action space, `Branches` is an array of
integers, each value corresponds to the number of possibilities for each branch.
When an Agent's Policy uses **discrete** actions, the
`ActionBuffers.DiscreteActions` passed to the Agent's `OnActionReceived()` function
is an array of integers. When defining the discrete vector action space, `Branches`
is an array of integers, each value corresponds to the number of possibilities for each branch.
For example, if we wanted an Agent that can move in a plane and jump, we could
define two branches (one for motion and one for jumping) because we want our

```csharp
// Get the action index for movement
int movement = Mathf.FloorToInt(act[0]);
int movement = actionBuffers.DiscreteActions[0];
int jump = Mathf.FloorToInt(act[1]);
int jump = actionBuffers.DiscreteActions[1];
// Look up the index in the movement action list:
if (movement == 1) { directionX = -1; }

directionX * 40f, directionY * 300f, directionZ * 40f));
```
Note that the above code example is a simplified extract from the AreaAgent
class, which provides alternate implementations for both the discrete and the
continuous action spaces.
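
Putting the two branches together, a minimal sketch of the corresponding `OnActionReceived` override (the force values mirror the snippet above; the cached `m_Rigidbody` field is an illustrative name, not part of the AreaAgent source):

```csharp
public override void OnActionReceived(ActionBuffers actionBuffers)
{
    var directionX = 0;
    var directionY = 0;
    var directionZ = 0;

    // Branch 0: movement (0 = none, 1 = left, 2 = right, 3 = forward, 4 = backward).
    var movement = actionBuffers.DiscreteActions[0];
    if (movement == 1) { directionX = -1; }
    if (movement == 2) { directionX = 1; }
    if (movement == 3) { directionZ = 1; }
    if (movement == 4) { directionZ = -1; }

    // Branch 1: jump (0 = no jump, 1 = jump).
    var jump = actionBuffers.DiscreteActions[1];
    if (jump == 1) { directionY = 1; }

    // m_Rigidbody: a Rigidbody assumed to be cached in Initialize().
    m_Rigidbody.AddForce(new Vector3(
        directionX * 40f, directionY * 300f, directionZ * 40f));
}
```
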
#### Masking Discrete Actions
When using Discrete Actions, it is possible to specify that some actions are

decide to perform the masked action. In order to mask an action, override the
`Agent.CollectDiscreteActionMasks()` virtual method, and call
`DiscreteActionMasker.SetMask()` in it:
`Agent.WriteDiscreteActionMask()` virtual method, and call
`WriteMask()` on the provided `IDiscreteActionMask`:
public override void CollectDiscreteActionMasks(DiscreteActionMasker actionMasker){
actionMasker.SetMask(branch, actionIndices)
public override void WriteDiscreteActionMask(IDiscreteActionMask actionMask)
{
actionMask.WriteMask(branch, actionIndices)
}
```

the action
- `actionIndices` is a list of `int` corresponding to the indices of the actions
that the Agent cannot perform.
that the Agent **cannot** perform.
For example, if you have an Agent with 2 branches and on the first branch
(branch 0) there are 4 possible actions : _"do nothing"_, _"jump"_, _"shoot"_

```csharp
SetMask(0, new int[2]{1,2})
WriteMask(0, new int[2]{1,2})
- You can call `SetMask` multiple times if you want to put masks on multiple
- You can call `WriteMask` multiple times if you want to put masks on multiple
branches.
- You cannot mask all the actions of a branch.
- You cannot mask actions in continuous control.
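
For example, a sketch of the full override that masks _jump_ (index 1) and _shoot_ (index 2) on branch 0 whenever a hypothetical `m_IsGrounded` flag is false:

```csharp
public override void WriteDiscreteActionMask(IDiscreteActionMask actionMask)
{
    // While airborne, the Policy cannot pick "jump" (1) or "shoot" (2) on branch 0.
    if (!m_IsGrounded)
    {
        actionMask.WriteMask(0, new[] { 1, 2 });
    }
}
```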

- Actions can either use `Discrete` or `Continuous` spaces.
- When using `Discrete` it is possible to assign multiple action branches, and
to mask certain actions.
- Agents can either use `Discrete` or `Continuous` actions.
- Discrete actions can have multiple action branches, and it's possible to mask
certain actions so that they won't be taken.
- When using continuous control, action values should be clipped to an
- Continuous action values should be clipped to an
appropriate range. The provided PPO model automatically clips these values
between -1 and 1, but third party training systems may not do so.

64
docs/Python-API.md


terminates the communication.
- **Behavior Specs : `env.behavior_specs`** Returns a Mapping of
`BehaviorName` to `BehaviorSpec` objects (read only).
A `BehaviorSpec` contains information such as the observation shapes, the
action type (multi-discrete or continuous) and the action shape. Note that
A `BehaviorSpec` contains the observation shapes and the
`ActionSpec` (which defines the action shape). Note that
the `BehaviorSpec` for a specific group is fixed throughout the simulation.
The number of entries in the Mapping can change over time if new Agent
behaviors are created during the simulation.

number of agents is not guaranteed to remain constant during the simulation
and it is not unusual to have either `DecisionSteps` or `TerminalSteps`
contain no Agents at all.
- **Set Actions :`env.set_actions(behavior_name: str, action: np.array)`** Sets
the actions for a whole agent group. `action` is a 2D `np.array` of
`dtype=np.int32` in the discrete action case and `dtype=np.float32` in the
continuous action case. The first dimension of `action` is the number of
agents that requested a decision since the last call to `env.step()`. The
second dimension is the number of discrete actions in multi-discrete action
type and the number of actions in continuous action type.
- **Set Actions :`env.set_actions(behavior_name: str, action: ActionTuple)`** Sets
the actions for a whole agent group. `action` is an `ActionTuple`, which
is made up of a 2D `np.array` of `dtype=np.int32` for discrete actions, and
`dtype=np.float32` for continuous actions. The first dimension of `np.array`
in the tuple is the number of agents that requested a decision since the
last call to `env.step()`. The second dimension is the number of discrete or
continuous actions for the corresponding array.
`env.set_action_for_agent(agent_group: str, agent_id: int, action: np.array)`**
`env.set_action_for_agent(agent_group: str, agent_id: int, action: ActionTuple)`**
identifier of the Agent. Action is a 1D array of type `dtype=np.int32` and
size equal to the number of discrete actions in multi-discrete action type and
of type `dtype=np.float32` and size equal to the number of actions in
continuous action type.
identifier of the Agent. `action` is an `ActionTuple` as described above.
**Note:** If no action is provided for an agent group between two calls to
`env.step()`, then the default action will be all zeros (in either discrete or
continuous action space).

- `agent_id` is an int vector of length batch size containing unique identifier
for the corresponding Agent. This is used to track Agents across simulation
steps.
- `action_mask` is an optional list of two dimensional array of booleans. Only
available in multi-discrete action space type. Each array corresponds to an
- `action_mask` is an optional list of two dimensional arrays of booleans which is only
available when using multi-discrete actions. Each array corresponds to an
action branch. The first dimension of each array is the batch size and the
second contains a mask for each action of the branch. If true, the action is
not available for the agent during this simulation step.

- `reward` is a float. Corresponds to the rewards collected by the agent since
the last simulation step.
- `agent_id` is an int and a unique identifier for the corresponding Agent.
- `action_mask` is an optional list of one dimensional array of booleans. Only
available in multi-discrete action space type. Each array corresponds to an
- `action_mask` is an optional list of one dimensional arrays of booleans which is only
available when using multi-discrete actions. Each array corresponds to an
action branch. Each array contains a mask for each action of the branch. If
true, the action is not available for the agent during this simulation step.

#### BehaviorSpec
An Agent behavior can have either discrete or continuous actions. To check which
type it is, use `spec.is_action_discrete()` or `spec.is_action_continuous()`.
If discrete, the action tensors are expected to be
`np.int32`. If continuous, the actions are expected to be `np.float32`.
A `BehaviorSpec` has the following fields :
- `observation_shapes` is a List of Tuples of int : Each Tuple corresponds to an

- `action_type` is the type of data of the action. It can be discrete or
continuous. If discrete, the action tensors are expected to be `np.int32`. If
continuous, the actions are expected to be `np.float32`.
- `action_size` is an `int` corresponding to the expected dimension of the
action array.
- In continuous action space it is the number of floats that constitute the
action.
- In discrete action space (same as multi-discrete) it corresponds to the
number of branches (the number of independent actions)
- `discrete_action_branches` is a Tuple of int only for discrete action space.
Each int corresponds to the number of different options for each branch of the
action. For example : In a game direction input (no movement, left, right) and
- `action_spec` is an `ActionSpec` namedtuple that defines the number and types
of actions for the Agent.
An `ActionSpec` has the following fields and properties:
- `continuous_size` is the number of floats that constitute the continuous actions.
- `discrete_size` is the number of branches (the number of independent actions) that
constitute the multi-discrete actions.
- `discrete_branches` is a Tuple of ints. Each int corresponds to the number of
different options for each branch of the action. For example:
In a game direction input (no movement, left, right) and
the first one with 3 options and the second with 2 options. (`action_size = 2`
the first one with 3 options and the second with 2 options. (`discrete_size = 2`
### Communicating additional information with the Environment

4
docs/Training-Configuration-File.md


A few considerations when deciding to use memory:
- LSTM does not work well with continuous vector action space. Please use
discrete vector action space for better results.
- LSTM does not work well with continuous vector actions. Please use
discrete actions for better results.
- Since the memories must be sent back and forth between Python and Unity, using
too large `memory_size` will slow down training.
- Adding a recurrent layer increases the complexity of the neural network, it is

10
gym-unity/gym_unity/envs/__init__.py


import gym
from gym import error, spaces
from mlagents_envs.base_env import BaseEnv
from mlagents_envs.base_env import ActionTuple, BaseEnv
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
from mlagents_envs import logging_util

action = self._flattener.lookup_action(action)
action = np.array(action).reshape((1, self.action_size))
self._env.set_actions(self.name, action)
action_tuple = ActionTuple()
if self.group_spec.action_spec.is_continuous():
action_tuple.add_continuous(action)
else:
action_tuple.add_discrete(action)
self._env.set_actions(self.name, action_tuple)
self._env.step()
decision_step, terminal_step = self._env.get_steps(self.name)

148
ml-agents-envs/mlagents_envs/base_env.py


)
class _ActionTupleBase(ABC):
"""
An object whose fields correspond to action data of continuous and discrete
spaces. Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
def __init__(
self,
continuous: Optional[np.ndarray] = None,
discrete: Optional[np.ndarray] = None,
):
self._continuous: Optional[np.ndarray] = None
self._discrete: Optional[np.ndarray] = None
if continuous is not None:
self.add_continuous(continuous)
if discrete is not None:
self.add_discrete(discrete)
@property
def continuous(self) -> np.ndarray:
return self._continuous
@property
def discrete(self) -> np.ndarray:
return self._discrete
def add_continuous(self, continuous: np.ndarray) -> None:
if continuous.dtype != np.float32:
continuous = continuous.astype(np.float32, copy=False)
if self._discrete is None:
self._discrete = np.zeros(
(continuous.shape[0], 0), dtype=self.discrete_dtype
)
self._continuous = continuous
def add_discrete(self, discrete: np.ndarray) -> None:
if discrete.dtype != self.discrete_dtype:
discrete = discrete.astype(self.discrete_dtype, copy=False)
if self._continuous is None:
self._continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32)
self._discrete = discrete
@property
@abstractmethod
def discrete_dtype(self) -> np.dtype:
pass
class ActionTuple(_ActionTupleBase):
"""
An object whose fields correspond to actions of different types.
Continuous and discrete actions are numpy arrays of type float32 and
int32, respectively and are type checked on construction.
Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size),
respectively. Note, this also holds when continuous or discrete size is
zero.
"""
@property
def discrete_dtype(self) -> np.dtype:
"""
The dtype of a discrete action.
"""
return np.int32
class ActionSpec(NamedTuple):
"""
A NamedTuple containing utility functions and information about the action spaces

"""
return len(self.discrete_branches)
def empty_action(self, n_agents: int) -> np.ndarray:
def empty_action(self, n_agents: int) -> ActionTuple:
Generates a numpy array corresponding to an empty action (all zeros)
Generates an ActionTuple corresponding to an empty action (all zeros)
if self.is_continuous():
return np.zeros((n_agents, self.continuous_size), dtype=np.float32)
return np.zeros((n_agents, self.discrete_size), dtype=np.int32)
_continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
return ActionTuple(continuous=_continuous, discrete=_discrete)
def random_action(self, n_agents: int) -> np.ndarray:
def random_action(self, n_agents: int) -> ActionTuple:
Generates a numpy array corresponding to a random action (either discrete
Generates an ActionTuple corresponding to a random action (either discrete
if self.is_continuous():
action = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
).astype(np.float32)
else:
branch_size = self.discrete_branches
action = np.column_stack(
_continuous = np.random.uniform(
low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
)
_discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
if self.discrete_size > 0:
_discrete = np.column_stack(
branch_size[i], # type: ignore
self.discrete_branches[i], # type: ignore
size=(n_agents),
dtype=np.int32,
)

return action
return ActionTuple(continuous=_continuous, discrete=_discrete)
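Illustrative sketch (assuming a hybrid spec with 2 continuous dimensions and one branch of 3 options): after this change both helpers hand back an ActionTuple covering every side the spec defines.

from mlagents_envs.base_env import ActionSpec

hybrid = ActionSpec(2, (3,))        # 2 continuous dims, one discrete branch of 3
empty = hybrid.empty_action(5)      # all zeros on both sides
rand = hybrid.random_action(5)      # uniform floats in [-1, 1], ints in [0, 3)
assert empty.continuous.shape == (5, 2) and empty.discrete.shape == (5, 1)
assert rand.discrete.max() < 3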
self, actions: np.ndarray, n_agents: Optional[int], name: str
) -> np.ndarray:
self, actions: ActionTuple, n_agents: Optional[int], name: str
) -> ActionTuple:
if self.continuous_size > 0:
_size = self.continuous_size
else:
_size = self.discrete_size
_expected_shape = (n_agents, _size) if n_agents is not None else (_size,)
if actions.shape != _expected_shape:
_expected_shape = (
(n_agents, self.continuous_size)
if n_agents is not None
else (self.continuous_size,)
)
if actions.continuous.shape != _expected_shape:
f"The behavior {name} needs an input of dimension "
f"The behavior {name} needs a continuous input of dimension "
f"received input of dimension {actions.shape}"
f"received input of dimension {actions.continuous.shape}"
_expected_type = np.float32 if self.is_continuous() else np.int32
if actions.dtype != _expected_type:
actions = actions.astype(_expected_type)
_expected_shape = (
(n_agents, self.discrete_size)
if n_agents is not None
else (self.discrete_size,)
)
if actions.discrete.shape != _expected_shape:
raise UnityActionException(
f"The behavior {name} needs a discrete input of dimension "
f"{_expected_shape} for (<number of agents>, <action size>) but "
f"received input of dimension {actions.discrete.shape}"
)
return actions
@staticmethod

"""
@abstractmethod
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
:param action: A two dimensional np.ndarray corresponding to the action
(either int or float)
:param action: An ActionTuple of continuous and/or discrete actions.
Actions are np.arrays with dimensions (n_agents, continuous_size) and
(n_agents, discrete_size), respectively.
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
) -> None:
"""
Sets the action for one of the agents in the simulation for the next

:param action: A one dimensional np.ndarray corresponding to the action
(either int or float)
:param action: An ActionTuple of continuous and/or discrete actions.
Actions are np.arrays with dimensions (1, continuous_size) and
(1, discrete_size), respectively. Note, the leading dimension of 1 is because
this action is meant for a single agent.
"""
@abstractmethod

22
ml-agents-envs/mlagents_envs/communicator_objects/agent_action_pb2.py


name='mlagents_envs/communicator_objects/agent_action.proto',
package='communicator_objects',
syntax='proto3',
serialized_pb=_b('\n5mlagents_envs/communicator_objects/agent_action.proto\x12\x14\x63ommunicator_objects\"K\n\x10\x41gentActionProto\x12\x16\n\x0evector_actions\x18\x01 \x03(\x02\x12\r\n\x05value\x18\x04 \x01(\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x05\x10\x06\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\n5mlagents_envs/communicator_objects/agent_action.proto\x12\x14\x63ommunicator_objects\"\x8c\x01\n\x10\x41gentActionProto\x12!\n\x19vector_actions_deprecated\x18\x01 \x03(\x02\x12\r\n\x05value\x18\x04 \x01(\x02\x12\x1a\n\x12\x63ontinuous_actions\x18\x06 \x03(\x02\x12\x18\n\x10\x64iscrete_actions\x18\x07 \x03(\x05J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x05\x10\x06\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
)

containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='vector_actions', full_name='communicator_objects.AgentActionProto.vector_actions', index=0,
name='vector_actions_deprecated', full_name='communicator_objects.AgentActionProto.vector_actions_deprecated', index=0,
number=1, type=2, cpp_type=6, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,

message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='continuous_actions', full_name='communicator_objects.AgentActionProto.continuous_actions', index=2,
number=6, type=2, cpp_type=6, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='discrete_actions', full_name='communicator_objects.AgentActionProto.discrete_actions', index=3,
number=7, type=5, cpp_type=1, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],

extension_ranges=[],
oneofs=[
],
serialized_start=79,
serialized_end=154,
serialized_start=80,
serialized_end=220,
)
DESCRIPTOR.message_types_by_name['AgentActionProto'] = _AGENTACTIONPROTO

12
ml-agents-envs/mlagents_envs/communicator_objects/agent_action_pb2.pyi


class AgentActionProto(google___protobuf___message___Message):
DESCRIPTOR: google___protobuf___descriptor___Descriptor = ...
vector_actions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___float]
vector_actions_deprecated = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___float]
continuous_actions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___float]
discrete_actions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___int]
vector_actions : typing___Optional[typing___Iterable[builtin___float]] = None,
vector_actions_deprecated : typing___Optional[typing___Iterable[builtin___float]] = None,
continuous_actions : typing___Optional[typing___Iterable[builtin___float]] = None,
discrete_actions : typing___Optional[typing___Iterable[builtin___int]] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> AgentActionProto: ...

def ClearField(self, field_name: typing_extensions___Literal[u"value",u"vector_actions"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"continuous_actions",u"discrete_actions",u"value",u"vector_actions_deprecated"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"value",b"value",u"vector_actions",b"vector_actions"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"continuous_actions",b"continuous_actions",u"discrete_actions",b"discrete_actions",u"value",b"value",u"vector_actions_deprecated",b"vector_actions_deprecated"]) -> None: ...
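For reference, a purely illustrative way to fill the regenerated message: the new continuous_actions/discrete_actions fields carry the split data, while vector_actions_deprecated is still mirrored for readers that predate this change (as the environment.py hunk further below does).

from mlagents_envs.communicator_objects.agent_action_pb2 import AgentActionProto

proto = AgentActionProto()
proto.continuous_actions.extend([0.1, -0.4])         # new split field (float)
proto.discrete_actions.extend([2, 0])                # new split field (int)
proto.vector_actions_deprecated.extend([0.1, -0.4])  # backward-compat mirror
proto.vector_actions_deprecated.extend([2.0, 0.0])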

82
ml-agents-envs/mlagents_envs/communicator_objects/brain_parameters_pb2.py


name='mlagents_envs/communicator_objects/brain_parameters.proto',
package='communicator_objects',
syntax='proto3',
serialized_pb=_b('\n9mlagents_envs/communicator_objects/brain_parameters.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/space_type.proto\"\xd9\x01\n\x14\x42rainParametersProto\x12\x1a\n\x12vector_action_size\x18\x03 \x03(\x05\x12\"\n\x1avector_action_descriptions\x18\x05 \x03(\t\x12\x46\n\x18vector_action_space_type\x18\x06 \x01(\x0e\x32$.communicator_objects.SpaceTypeProto\x12\x12\n\nbrain_name\x18\x07 \x01(\t\x12\x13\n\x0bis_training\x18\x08 \x01(\x08J\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x04\x10\x05\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\n9mlagents_envs/communicator_objects/brain_parameters.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/space_type.proto\"\x8b\x01\n\x0f\x41\x63tionSpecProto\x12\x1e\n\x16num_continuous_actions\x18\x01 \x01(\x05\x12\x1c\n\x14num_discrete_actions\x18\x02 \x01(\x05\x12\x1d\n\x15\x64iscrete_branch_sizes\x18\x03 \x03(\x05\x12\x1b\n\x13\x61\x63tion_descriptions\x18\x04 \x03(\t\"\xb6\x02\n\x14\x42rainParametersProto\x12%\n\x1dvector_action_size_deprecated\x18\x03 \x03(\x05\x12-\n%vector_action_descriptions_deprecated\x18\x05 \x03(\t\x12Q\n#vector_action_space_type_deprecated\x18\x06 \x01(\x0e\x32$.communicator_objects.SpaceTypeProto\x12\x12\n\nbrain_name\x18\x07 \x01(\t\x12\x13\n\x0bis_training\x18\x08 \x01(\x08\x12:\n\x0b\x61\x63tion_spec\x18\t \x01(\x0b\x32%.communicator_objects.ActionSpecProtoJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x04\x10\x05\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
,
dependencies=[mlagents__envs_dot_communicator__objects_dot_space__type__pb2.DESCRIPTOR,])

_ACTIONSPECPROTO = _descriptor.Descriptor(
name='ActionSpecProto',
full_name='communicator_objects.ActionSpecProto',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='num_continuous_actions', full_name='communicator_objects.ActionSpecProto.num_continuous_actions', index=0,
number=1, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='num_discrete_actions', full_name='communicator_objects.ActionSpecProto.num_discrete_actions', index=1,
number=2, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='discrete_branch_sizes', full_name='communicator_objects.ActionSpecProto.discrete_branch_sizes', index=2,
number=3, type=5, cpp_type=1, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='action_descriptions', full_name='communicator_objects.ActionSpecProto.action_descriptions', index=3,
number=4, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=137,
serialized_end=276,
)
_BRAINPARAMETERSPROTO = _descriptor.Descriptor(
name='BrainParametersProto',
full_name='communicator_objects.BrainParametersProto',

fields=[
_descriptor.FieldDescriptor(
name='vector_action_size', full_name='communicator_objects.BrainParametersProto.vector_action_size', index=0,
name='vector_action_size_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_size_deprecated', index=0,
number=3, type=5, cpp_type=1, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,

name='vector_action_descriptions', full_name='communicator_objects.BrainParametersProto.vector_action_descriptions', index=1,
name='vector_action_descriptions_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_descriptions_deprecated', index=1,
number=5, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,

name='vector_action_space_type', full_name='communicator_objects.BrainParametersProto.vector_action_space_type', index=2,
name='vector_action_space_type_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_space_type_deprecated', index=2,
number=6, type=14, cpp_type=8, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,

message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='action_spec', full_name='communicator_objects.BrainParametersProto.action_spec', index=5,
number=9, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],

extension_ranges=[],
oneofs=[
],
serialized_start=137,
serialized_end=354,
serialized_start=279,
serialized_end=589,
_BRAINPARAMETERSPROTO.fields_by_name['vector_action_space_type'].enum_type = mlagents__envs_dot_communicator__objects_dot_space__type__pb2._SPACETYPEPROTO
_BRAINPARAMETERSPROTO.fields_by_name['vector_action_space_type_deprecated'].enum_type = mlagents__envs_dot_communicator__objects_dot_space__type__pb2._SPACETYPEPROTO
_BRAINPARAMETERSPROTO.fields_by_name['action_spec'].message_type = _ACTIONSPECPROTO
DESCRIPTOR.message_types_by_name['ActionSpecProto'] = _ACTIONSPECPROTO
ActionSpecProto = _reflection.GeneratedProtocolMessageType('ActionSpecProto', (_message.Message,), dict(
DESCRIPTOR = _ACTIONSPECPROTO,
__module__ = 'mlagents_envs.communicator_objects.brain_parameters_pb2'
# @@protoc_insertion_point(class_scope:communicator_objects.ActionSpecProto)
))
_sym_db.RegisterMessage(ActionSpecProto)
BrainParametersProto = _reflection.GeneratedProtocolMessageType('BrainParametersProto', (_message.Message,), dict(
DESCRIPTOR = _BRAINPARAMETERSPROTO,

45
ml-agents-envs/mlagents_envs/communicator_objects/brain_parameters_pb2.pyi


builtin___int = int
class ActionSpecProto(google___protobuf___message___Message):
DESCRIPTOR: google___protobuf___descriptor___Descriptor = ...
num_continuous_actions = ... # type: builtin___int
num_discrete_actions = ... # type: builtin___int
discrete_branch_sizes = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___int]
action_descriptions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[typing___Text]
def __init__(self,
*,
num_continuous_actions : typing___Optional[builtin___int] = None,
num_discrete_actions : typing___Optional[builtin___int] = None,
discrete_branch_sizes : typing___Optional[typing___Iterable[builtin___int]] = None,
action_descriptions : typing___Optional[typing___Iterable[typing___Text]] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> ActionSpecProto: ...
def MergeFrom(self, other_msg: google___protobuf___message___Message) -> None: ...
def CopyFrom(self, other_msg: google___protobuf___message___Message) -> None: ...
if sys.version_info >= (3,):
def ClearField(self, field_name: typing_extensions___Literal[u"action_descriptions",u"discrete_branch_sizes",u"num_continuous_actions",u"num_discrete_actions"]) -> None: ...
else:
def ClearField(self, field_name: typing_extensions___Literal[u"action_descriptions",b"action_descriptions",u"discrete_branch_sizes",b"discrete_branch_sizes",u"num_continuous_actions",b"num_continuous_actions",u"num_discrete_actions",b"num_discrete_actions"]) -> None: ...
vector_action_size = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___int]
vector_action_descriptions = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[typing___Text]
vector_action_space_type = ... # type: mlagents_envs___communicator_objects___space_type_pb2___SpaceTypeProto
vector_action_size_deprecated = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[builtin___int]
vector_action_descriptions_deprecated = ... # type: google___protobuf___internal___containers___RepeatedScalarFieldContainer[typing___Text]
vector_action_space_type_deprecated = ... # type: mlagents_envs___communicator_objects___space_type_pb2___SpaceTypeProto
@property
def action_spec(self) -> ActionSpecProto: ...
vector_action_size : typing___Optional[typing___Iterable[builtin___int]] = None,
vector_action_descriptions : typing___Optional[typing___Iterable[typing___Text]] = None,
vector_action_space_type : typing___Optional[mlagents_envs___communicator_objects___space_type_pb2___SpaceTypeProto] = None,
vector_action_size_deprecated : typing___Optional[typing___Iterable[builtin___int]] = None,
vector_action_descriptions_deprecated : typing___Optional[typing___Iterable[typing___Text]] = None,
vector_action_space_type_deprecated : typing___Optional[mlagents_envs___communicator_objects___space_type_pb2___SpaceTypeProto] = None,
action_spec : typing___Optional[ActionSpecProto] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> BrainParametersProto: ...

def ClearField(self, field_name: typing_extensions___Literal[u"brain_name",u"is_training",u"vector_action_descriptions",u"vector_action_size",u"vector_action_space_type"]) -> None: ...
def HasField(self, field_name: typing_extensions___Literal[u"action_spec"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_spec",u"brain_name",u"is_training",u"vector_action_descriptions_deprecated",u"vector_action_size_deprecated",u"vector_action_space_type_deprecated"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"brain_name",b"brain_name",u"is_training",b"is_training",u"vector_action_descriptions",b"vector_action_descriptions",u"vector_action_size",b"vector_action_size",u"vector_action_space_type",b"vector_action_space_type"]) -> None: ...
def HasField(self, field_name: typing_extensions___Literal[u"action_spec",b"action_spec"]) -> builtin___bool: ...
def ClearField(self, field_name: typing_extensions___Literal[u"action_spec",b"action_spec",u"brain_name",b"brain_name",u"is_training",b"is_training",u"vector_action_descriptions_deprecated",b"vector_action_descriptions_deprecated",u"vector_action_size_deprecated",b"vector_action_size_deprecated",u"vector_action_space_type_deprecated",b"vector_action_space_type_deprecated"]) -> None: ...
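A small sketch of how the new nested message is meant to be used; it mirrors the MockCommunicator change further down, and the brain name here is just a placeholder.

from mlagents_envs.communicator_objects.brain_parameters_pb2 import (
    ActionSpecProto,
    BrainParametersProto,
)

action_spec = ActionSpecProto(num_discrete_actions=2, discrete_branch_sizes=[3, 2])
bp = BrainParametersProto(
    brain_name="RealFakeBrain", is_training=True, action_spec=action_spec
)
assert bp.HasField("action_spec")  # replaces the deprecated vector_action_* fields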

13
ml-agents-envs/mlagents_envs/communicator_objects/capabilities_pb2.py


name='mlagents_envs/communicator_objects/capabilities.proto',
package='communicator_objects',
syntax='proto3',
serialized_pb=_b('\n5mlagents_envs/communicator_objects/capabilities.proto\x12\x14\x63ommunicator_objects\"}\n\x18UnityRLCapabilitiesProto\x12\x1a\n\x12\x62\x61seRLCapabilities\x18\x01 \x01(\x08\x12#\n\x1b\x63oncatenatedPngObservations\x18\x02 \x01(\x08\x12 \n\x18\x63ompressedChannelMapping\x18\x03 \x01(\x08\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
serialized_pb=_b('\n5mlagents_envs/communicator_objects/capabilities.proto\x12\x14\x63ommunicator_objects\"\x94\x01\n\x18UnityRLCapabilitiesProto\x12\x1a\n\x12\x62\x61seRLCapabilities\x18\x01 \x01(\x08\x12#\n\x1b\x63oncatenatedPngObservations\x18\x02 \x01(\x08\x12 \n\x18\x63ompressedChannelMapping\x18\x03 \x01(\x08\x12\x15\n\rhybridActions\x18\x04 \x01(\x08\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3')
)

message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='hybridActions', full_name='communicator_objects.UnityRLCapabilitiesProto.hybridActions', index=3,
number=4, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None, file=DESCRIPTOR),
],
extensions=[
],

extension_ranges=[],
oneofs=[
],
serialized_start=79,
serialized_end=204,
serialized_start=80,
serialized_end=228,
)
DESCRIPTOR.message_types_by_name['UnityRLCapabilitiesProto'] = _UNITYRLCAPABILITIESPROTO

6
ml-agents-envs/mlagents_envs/communicator_objects/capabilities_pb2.pyi


baseRLCapabilities = ... # type: builtin___bool
concatenatedPngObservations = ... # type: builtin___bool
compressedChannelMapping = ... # type: builtin___bool
hybridActions = ... # type: builtin___bool
def __init__(self,
*,

hybridActions : typing___Optional[builtin___bool] = None,
) -> None: ...
@classmethod
def FromString(cls, s: builtin___bytes) -> UnityRLCapabilitiesProto: ...

def ClearField(self, field_name: typing_extensions___Literal[u"baseRLCapabilities",u"compressedChannelMapping",u"concatenatedPngObservations"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"baseRLCapabilities",u"compressedChannelMapping",u"concatenatedPngObservations",u"hybridActions"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"baseRLCapabilities",b"baseRLCapabilities",u"compressedChannelMapping",b"compressedChannelMapping",u"concatenatedPngObservations",b"concatenatedPngObservations"]) -> None: ...
def ClearField(self, field_name: typing_extensions___Literal[u"baseRLCapabilities",b"baseRLCapabilities",u"compressedChannelMapping",b"compressedChannelMapping",u"concatenatedPngObservations",b"concatenatedPngObservations",u"hybridActions",b"hybridActions"]) -> None: ...

30
ml-agents-envs/mlagents_envs/environment.py


DecisionSteps,
TerminalSteps,
BehaviorSpec,
ActionTuple,
BehaviorName,
AgentId,
BehaviorMapping,

# * 1.0.0 - initial version
# * 1.1.0 - support concatenated PNGs for compressed observations.
# * 1.2.0 - support compression mapping for stacked compressed observations.
API_VERSION = "1.2.0"
# * 1.3.0 - support action spaces with both continuous and discrete actions.
API_VERSION = "1.3.0"
# Default port that the editor listens on. If an environment executable
# isn't specified, this port will be used.

capabilities.baseRLCapabilities = True
capabilities.concatenatedPngObservations = True
capabilities.compressedChannelMapping = True
capabilities.hybridActions = True
return capabilities
@staticmethod

self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self._env_specs: Dict[str, BehaviorSpec] = {}
self._env_actions: Dict[str, np.ndarray] = {}
self._env_actions: Dict[str, ActionTuple] = {}
self._is_first_message = True
self._update_behavior_specs(aca_output)

f"agent group in the environment"
)
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:
return

self._env_actions[behavior_name] = action
def set_action_for_agent(
self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
) -> None:
self._assert_behavior_exists(behavior_name)
if behavior_name not in self._env_state:

agent_id
)
) from ie
self._env_actions[behavior_name][index] = action
if action_spec.continuous_size > 0:
self._env_actions[behavior_name].continuous[index] = action.continuous[0, :]
if action_spec.discrete_size > 0:
self._env_actions[behavior_name].discrete[index] = action.discrete[0, :]
def get_steps(
self, behavior_name: BehaviorName

@timed
def _generate_step_input(
self, vector_action: Dict[str, np.ndarray]
self, vector_action: Dict[str, ActionTuple]
) -> UnityInputProto:
rl_in = UnityRLInputProto()
for b in vector_action:

for i in range(n_agents):
action = AgentActionProto(vector_actions=vector_action[b][i])
action = AgentActionProto()
if vector_action[b].continuous is not None:
action.vector_actions_deprecated.extend(
vector_action[b].continuous[i]
)
action.continuous_actions.extend(vector_action[b].continuous[i])
if vector_action[b].discrete is not None:
action.vector_actions_deprecated.extend(
vector_action[b].discrete[i]
)
action.discrete_actions.extend(vector_action[b].discrete[i])
rl_in.agent_actions[b].value.extend([action])
rl_in.command = STEP
rl_in.side_channel = bytes(

18
ml-agents-envs/mlagents_envs/mock_communicator.py


from .communicator import Communicator
from .environment import UnityEnvironment
from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.brain_parameters_pb2 import (
BrainParametersProto,
ActionSpecProto,
)
from mlagents_envs.communicator_objects.unity_rl_initialization_output_pb2 import (
UnityRLInitializationOutputProto,
)

NONE as COMPRESSION_TYPE_NONE,
PNG as COMPRESSION_TYPE_PNG,
)
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
class MockCommunicator(Communicator):

self.vec_obs_size = vec_obs_size
def initialize(self, inputs: UnityInputProto) -> UnityOutputProto:
if self.is_discrete:
action_spec = ActionSpecProto(
num_discrete_actions=2, discrete_branch_sizes=[3, 2]
)
else:
action_spec = ActionSpecProto(num_continuous_actions=2)
vector_action_size=[2],
vector_action_descriptions=["", ""],
vector_action_space_type=discrete if self.is_discrete else continuous,
brain_name=self.brain_name,
is_training=True,
brain_name=self.brain_name, is_training=True, action_spec=action_spec
)
rl_init = UnityRLInitializationOutputProto(
name="RealFakeAcademy",

23
ml-agents-envs/mlagents_envs/rpc_utils.py


from mlagents_envs.base_env import (
BehaviorSpec,
BehaviorSpec,
DecisionSteps,
TerminalSteps,
)

:return: BehaviorSpec object.
"""
observation_shape = [tuple(obs.shape) for obs in agent_info.observations]
if brain_param_proto.vector_action_space_type == 1:
action_spec = ActionSpec(brain_param_proto.vector_action_size[0], ())
# proto from communicator < v1.3 does not set action spec, use deprecated fields instead
if (
brain_param_proto.action_spec.num_continuous_actions == 0
and brain_param_proto.action_spec.num_discrete_actions == 0
):
if brain_param_proto.vector_action_space_type_deprecated == 1:
action_spec = ActionSpec(
brain_param_proto.vector_action_size_deprecated[0], ()
)
else:
action_spec = ActionSpec(
0, tuple(brain_param_proto.vector_action_size_deprecated)
)
action_spec = ActionSpec(0, tuple(brain_param_proto.vector_action_size))
action_spec_proto = brain_param_proto.action_spec
action_spec = ActionSpec(
action_spec_proto.num_continuous_actions,
tuple(branch for branch in action_spec_proto.discrete_branch_sizes),
)
return BehaviorSpec(observation_shape, action_spec)
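To make the fallback explicit, here is a sketch of the two decode paths (my own illustration): a default, all-zero ActionSpecProto is taken to mean the sender predates API 1.3.0, so the deprecated fields are read instead.

from mlagents_envs.communicator_objects.brain_parameters_pb2 import (
    ActionSpecProto,
    BrainParametersProto,
)

old_style = BrainParametersProto()
old_style.vector_action_size_deprecated.extend([5, 4])
old_style.vector_action_space_type_deprecated = 0   # discrete
# -> decoded through the fallback branch as ActionSpec(0, (5, 4))

new_style = BrainParametersProto(action_spec=ActionSpecProto(num_continuous_actions=3))
# -> decoded through the action_spec branch as ActionSpec(3, ())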

6
ml-agents-envs/mlagents_envs/tests/test_envs.py


import pytest
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
from mlagents_envs.base_env import DecisionSteps, TerminalSteps, ActionTuple
from mlagents_envs.exception import UnityEnvironmentException, UnityActionException
from mlagents_envs.mock_communicator import MockCommunicator

env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents - 1))
decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
n_agents = len(decision_steps)
env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents) - 1)
_empty_act = spec.action_spec.empty_action(n_agents)
next_action = ActionTuple(_empty_act.continuous - 1, _empty_act.discrete - 1)
env.set_actions("RealFakeBrain", next_action)
env.step()
env.close()

33
ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py


return agent_info_protos
# The arguments here are the DecisionSteps, TerminalSteps and actions for a single agent name
# The arguments here are the DecisionSteps, TerminalSteps and continuous/discrete actions for a single agent name
decision_steps: DecisionSteps, terminal_steps: TerminalSteps, actions: np.ndarray
decision_steps: DecisionSteps,
terminal_steps: TerminalSteps,
continuous_actions: np.ndarray,
discrete_actions: np.ndarray,
agent_action_protos = [
AgentActionProto(vector_actions=action) for action in actions
]
agent_action_protos = []
num_agents = (
len(continuous_actions)
if continuous_actions is not None
else len(discrete_actions)
)
for i in range(num_agents):
proto = AgentActionProto()
if continuous_actions is not None:
proto.continuous_actions.extend(continuous_actions[i])
proto.vector_actions_deprecated.extend(continuous_actions[i])
if discrete_actions is not None:
proto.discrete_actions.extend(discrete_actions[i])
proto.vector_actions_deprecated.extend(discrete_actions[i])
agent_action_protos.append(proto)
agent_info_action_pair_protos = [
AgentInfoActionPairProto(agent_info=agent_info_proto, action_info=action_proto)
for agent_info_proto, action_proto in zip(

def test_agent_behavior_spec_from_proto():
agent_proto = generate_list_agent_proto(1, [(3,), (4,)])[0]
bp = BrainParametersProto()
bp.vector_action_size.extend([5, 4])
bp.vector_action_space_type = 0
bp.vector_action_size_deprecated.extend([5, 4])
bp.vector_action_space_type_deprecated = 0
behavior_spec = behavior_spec_from_proto(bp, agent_proto)
assert behavior_spec.action_spec.is_discrete()
assert not behavior_spec.action_spec.is_continuous()

bp = BrainParametersProto()
bp.vector_action_size.extend([6])
bp.vector_action_space_type = 1
bp.vector_action_size_deprecated.extend([6])
bp.vector_action_space_type_deprecated = 1
behavior_spec = behavior_spec_from_proto(bp, agent_proto)
assert not behavior_spec.action_spec.is_discrete()
assert behavior_spec.action_spec.is_continuous()

27
ml-agents-envs/mlagents_envs/tests/test_steps.py


assert specs.discrete_branches == ()
assert specs.discrete_size == 0
assert specs.continuous_size == 3
assert specs.empty_action(5).shape == (5, 3)
assert specs.empty_action(5).dtype == np.float32
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
assert specs.empty_action(5).shape == (5, 1)
assert specs.empty_action(5).dtype == np.int32
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
specs = ActionSpec(3, (3,))
assert specs.continuous_size == 3
assert specs.discrete_branches == (3,)
assert specs.discrete_size == 1
assert specs.empty_action(5).continuous.shape == (5, 3)
assert specs.empty_action(5).continuous.dtype == np.float32
assert specs.empty_action(5).discrete.shape == (5, 1)
assert specs.empty_action(5).discrete.dtype == np.int32
def test_action_generator():

zero_action = specs.empty_action(4)
zero_action = specs.empty_action(4).continuous
random_action = specs.random_action(4)
print(specs.random_action(4))
random_action = specs.random_action(4).continuous
print(random_action)
assert random_action.dtype == np.float32
assert random_action.shape == (4, action_len)
assert np.min(random_action) >= -1

action_shape = (10, 20, 30)
specs = ActionSpec.create_discrete(action_shape)
zero_action = specs.empty_action(4)
zero_action = specs.empty_action(4).discrete
random_action = specs.random_action(4)
random_action = specs.random_action(4).discrete
assert random_action.dtype == np.int32
assert random_action.shape == (4, len(action_shape))
assert np.min(random_action) >= 0

3
ml-agents/mlagents/trainers/action_info.py


class ActionInfo(NamedTuple):
action: Any
env_action: Any
value: Any
outputs: ActionInfoOutputs
agent_ids: List[AgentId]

return ActionInfo([], [], {}, [])
return ActionInfo([], [], [], {}, [])

23
ml-agents/mlagents/trainers/agent_processor.py


import queue
from mlagents_envs.base_env import (
ActionTuple,
DecisionSteps,
DecisionStep,
TerminalSteps,

from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.policy import Policy
from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.behavior_id_utils import get_global_agent_id

done = terminated # Since this is an ongoing step
interrupted = step.interrupted if terminated else False
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
if self.policy.use_continuous_act:
action_pre = stored_take_action_outputs["pre_action"][idx]
else:
action_pre = None
action_probs = stored_take_action_outputs["log_probs"][idx]
stored_actions = stored_take_action_outputs["action"]
action_tuple = ActionTuple(
continuous=stored_actions.continuous[idx],
discrete=stored_actions.discrete[idx],
)
stored_action_probs = stored_take_action_outputs["log_probs"]
log_probs_tuple = LogProbsTuple(
continuous=stored_action_probs.continuous[idx],
discrete=stored_action_probs.discrete[idx],
)
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
experience = AgentExperience(

action=action,
action_probs=action_probs,
action_pre=action_pre,
action=action_tuple,
action_probs=log_probs_tuple,
action_mask=action_mask,
prev_action=prev_action,
interrupted=interrupted,
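In other words, the stored policy output is now a batched ActionTuple (and LogProbsTuple), and the processor slices one agent's row out of it before the experience is written. A minimal sketch of that slicing, with made-up batch contents:

import numpy as np
from mlagents_envs.base_env import ActionTuple

# Made-up batched output for 8 agents: 2 continuous dims, 1 discrete branch.
stored_actions = ActionTuple(
    continuous=np.random.rand(8, 2).astype(np.float32),
    discrete=np.zeros((8, 1), dtype=np.int32),
)
idx = 3  # this agent's position in the batch
action_tuple = ActionTuple(
    continuous=stored_actions.continuous[idx],
    discrete=stored_actions.discrete[idx],
)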

2
ml-agents/mlagents/trainers/buffer.py


class AgentBufferField(list):
"""
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to his
AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
AgentBufferField with the append method.
"""

18
ml-agents/mlagents/trainers/demo_loader.py


[next_pair_info.agent_info], behavior_spec
)
previous_action = (
np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
np.array(
pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32
)
* 0
pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
pair_infos[idx - 1].action_info.vector_actions_deprecated,
dtype=np.float32,
)
next_done = len(next_terminal_step) == 1

for i, obs in enumerate(split_obs.visual_observations):
demo_raw_buffer["visual_obs%d" % i].append(obs)
demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
# TODO: update the demonstration files and read from the new proto format
if behavior_spec.action_spec.continuous_size > 0:
demo_raw_buffer["continuous_action"].append(
current_pair_info.action_info.vector_actions_deprecated
)
if behavior_spec.action_spec.discrete_size > 0:
demo_raw_buffer["discrete_action"].append(
current_pair_info.action_info.vector_actions_deprecated
)
demo_raw_buffer["prev_action"].append(previous_action)
if next_done:
demo_raw_buffer.resequence_and_append(

1
ml-agents/mlagents/trainers/env_manager.py


from abc import ABC, abstractmethod
from typing import List, Dict, NamedTuple, Iterable, Tuple
from mlagents_envs.base_env import (
DecisionSteps,

4
ml-agents/mlagents/trainers/optimizer/tf_optimizer.py


[self.value_heads, self.policy.memory_out, self.memory_out], feed_dict
)
prev_action = (
batch["actions"][-1] if not self.policy.use_continuous_act else None
batch["discrete_action"][-1]
if not self.policy.use_continuous_act
else None
)
else:
value_estimates = self.sess.run(self.value_heads, feed_dict)

40
ml-agents/mlagents/trainers/policy/policy.py


from typing import Dict, List, Optional
import numpy as np
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.settings import TrainerSettings, NetworkSettings

self.trainer_settings = trainer_settings
self.network_settings: NetworkSettings = trainer_settings.network_settings
self.seed = seed
if (
self.behavior_spec.action_spec.continuous_size > 0
and self.behavior_spec.action_spec.discrete_size > 0
):
raise UnityPolicyException("Trainers do not support mixed action spaces.")
self.act_size = (
list(self.behavior_spec.action_spec.discrete_branches)
if self.behavior_spec.action_spec.is_discrete()

1 for shape in behavior_spec.observation_shapes if len(shape) == 3
)
self.use_continuous_act = self.behavior_spec.action_spec.is_continuous()
# This line will be removed in the ActionBuffer change
self.num_branches = (
self.behavior_spec.action_spec.continuous_size
+ self.behavior_spec.action_spec.discrete_size
)
self.previous_action_dict: Dict[str, np.array] = {}
self.previous_action_dict: Dict[str, np.ndarray] = {}
self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_settings.network_settings.normalize
self.use_recurrent = self.network_settings.memory is not None

) -> None:
if memory_matrix is None:
return
for index, agent_id in enumerate(agent_ids):
self.memory_dict[agent_id] = memory_matrix[index, :]

if agent_id in self.memory_dict:
self.memory_dict.pop(agent_id)
def make_empty_previous_action(self, num_agents):
def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
return np.zeros((num_agents, self.num_branches), dtype=np.int)
return np.zeros(
(num_agents, self.behavior_spec.action_spec.discrete_size), dtype=np.int32
)
self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
self, agent_ids: List[str], action_tuple: ActionTuple
if action_matrix is None:
return
self.previous_action_dict[agent_id] = action_matrix[index, :]
self.previous_action_dict[agent_id] = action_tuple.discrete[index, :]
action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
action_matrix = self.make_empty_previous_action(len(agent_ids))
for index, agent_id in enumerate(agent_ids):
if agent_id in self.previous_action_dict:
action_matrix[index, :] = self.previous_action_dict[agent_id]

raise NotImplementedError
@staticmethod
def check_nan_action(action: Optional[np.ndarray]) -> None:
def check_nan_action(action: Optional[ActionTuple]) -> None:
d = np.sum(action)
d = np.sum(action.continuous)
raise RuntimeError("NaN action detected.")
raise RuntimeError("Continuous NaN action detected.")
d = np.sum(action.discrete)
has_nan = np.isnan(d)
if has_nan:
raise RuntimeError("Discrete NaN action detected.")
@abstractmethod
def update_normalization(self, vector_obs: np.ndarray) -> None:

33
ml-agents/mlagents/trainers/policy/tf_policy.py


from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.exception import UnityException
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.base_env import DecisionSteps, ActionTuple, BehaviorSpec
from mlagents.trainers.tf.models import ModelUtils
from mlagents.trainers.settings import TrainerSettings, EncoderType
from mlagents.trainers import __version__

reparameterize,
condition_sigma_on_obs,
)
if (
self.behavior_spec.action_spec.continuous_size > 0
and self.behavior_spec.action_spec.discrete_size > 0
):
raise UnityPolicyException(
"TensorFlow does not support mixed action spaces. Please run with the Torch framework."
)
# for ghost trainer save/load snapshots
self.assign_phs: List[tf.Tensor] = []
self.assign_ops: List[tf.Operation] = []

feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)

)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
# For Compatibility with buffer changes for hybrid action support
if "log_probs" in run_out:
log_probs_tuple = LogProbsTuple()
if self.behavior_spec.action_spec.is_continuous():
log_probs_tuple.add_continuous(run_out["log_probs"])
else:
log_probs_tuple.add_discrete(run_out["log_probs"])
run_out["log_probs"] = log_probs_tuple
if "action" in run_out:
action_tuple = ActionTuple()
env_action_tuple = ActionTuple()
if self.behavior_spec.action_spec.is_continuous():
action_tuple.add_continuous(run_out["pre_action"])
env_action_tuple.add_continuous(run_out["action"])
else:
action_tuple.add_discrete(run_out["action"])
env_action_tuple.add_discrete(run_out["action"])
run_out["action"] = action_tuple
run_out["env_action"] = env_action_tuple
env_action=run_out.get("env_action"),
value=run_out.get("value"),
outputs=run_out,
agent_ids=decision_requests.agent_id,

77
ml-agents/mlagents/trainers/policy/torch_policy.py


SeparateActorCritic,
GlobalSteps,
)
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
EPSILON = 1e-7 # Small value to avoid divide by zero

conditional_sigma=self.condition_sigma_on_obs,
tanh_squash=tanh_squash,
)
self._clip_action = not tanh_squash
# Save the m_size needed for export
self._export_m_size = self.m_size
# m_size needed for training is determined by network, not trainer settings

self._clip_action = not tanh_squash
@property
def export_memory_size(self) -> int:

) -> Tuple[SplitObservations, np.ndarray]:
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
mask = None
if not self.use_continuous_act:
if self.behavior_spec.action_spec.discrete_size > 0:
mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
all_log_probs: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
"""
:param vec_obs: List of vector observations.
:param vis_obs: List of visual observations.

:param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
:return: Tuple of actions, actions clipped to -1, 1, log probabilities (dependent on all_log_probs),
entropies, and output memories, all as Torch Tensors.
:return: Tuple of AgentAction, ActionLogProbs, entropies, and output memories.
if memories is None:
dists, memories = self.actor_critic.get_dists(
vec_obs, vis_obs, masks, memories, seq_len
)
else:
# If we're using LSTM. we need to execute the values to get the critic memories
dists, _, memories = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
action_list, dists
actions, log_probs, entropies, _, memories = self.actor_critic.get_action_stats_and_value(
vec_obs, vis_obs, masks, memories, seq_len
actions = torch.stack(action_list, dim=-1)
if self.use_continuous_act:
actions = actions[:, :, 0]
else:
actions = actions[:, 0, :]
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
if self._clip_action and self.use_continuous_act:
clipped_action = torch.clamp(actions, -3, 3) / 3
else:
clipped_action = actions
return (
actions,
clipped_action,
all_logs if all_log_probs else log_probs,
entropy_sum,
memories,
)
return (actions, log_probs, entropies, memories)
actions: torch.Tensor,
actions: AgentAction,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
dists, value_heads, _ = self.actor_critic.get_dist_and_value(
vec_obs, vis_obs, masks, memories, seq_len
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
log_probs, entropies, value_heads = self.actor_critic.get_stats_and_value(
vec_obs, vis_obs, actions, masks, memories, seq_len
action_list = [actions[..., i] for i in range(actions.shape[-1])]
log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return log_probs, entropy_sum, value_heads
return log_probs, entropies, value_heads
@timed
def evaluate(

run_out = {}
with torch.no_grad():
action, clipped_action, log_probs, entropy, memories = self.sample_actions(
action, log_probs, entropy, memories = self.sample_actions(
run_out["pre_action"] = ModelUtils.to_numpy(action)
run_out["action"] = ModelUtils.to_numpy(clipped_action)
# Todo - make pre_action difference
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
action_tuple = action.to_action_tuple()
run_out["action"] = action_tuple
# This is the clipped action which is not saved to the buffer
# but is exclusively sent to the environment.
env_action_tuple = action.to_action_tuple(clip=self._clip_action)
run_out["env_action"] = env_action_tuple
run_out["log_probs"] = log_probs.to_log_probs_tuple()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0
if self.use_recurrent:

self.check_nan_action(run_out.get("action"))
return ActionInfo(
action=run_out.get("action"),
env_action=run_out.get("env_action"),
value=run_out.get("value"),
outputs=run_out,
agent_ids=list(decision_requests.agent_id),

9
ml-agents/mlagents/trainers/ppo/optimizer_tf.py


self.policy.sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
if self.policy.use_continuous_act: # For hybrid action buffer support
feed_dict[self.all_old_log_probs] = mini_batch["continuous_log_probs"]
feed_dict[self.policy.output_pre] = mini_batch["continuous_action"]
feed_dict[self.policy.output] = mini_batch["actions"]
feed_dict[self.all_old_log_probs] = mini_batch["discrete_log_probs"]
feed_dict[self.policy.output] = mini_batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]

13
ml-agents/mlagents/trainers/ppo/optimizer_torch.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils

advantage = advantages.unsqueeze(-1)
decay_epsilon = self.hyperparameters.epsilon
r_theta = torch.exp(log_probs - old_log_probs)
p_opt_a = r_theta * advantage
p_opt_b = (

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions_pre"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories = [
ModelUtils.list_to_tensor(batch["memory"][i])

vis_obs.append(vis_ob)
else:
vis_obs = []
log_probs, entropy, values = self.policy.evaluate_actions(
vec_obs,
vis_obs,

seq_len=self.policy.sequence_length,
)
old_log_probs = ActionLogProbs.from_dict(batch).flatten()
log_probs = log_probs.flatten()
loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
value_loss = self.ppo_value_loss(
values, old_values, returns, decay_eps, loss_masks

log_probs,
ModelUtils.list_to_tensor(batch["action_probs"]),
old_log_probs,
loss_masks,
)
loss = (

6
ml-agents/mlagents/trainers/sac/optimizer_tf.py


feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
if self.policy.use_continuous_act:
feed_dict[self.policy_network.external_action_in] = batch["actions"]
feed_dict[self.policy_network.external_action_in] = batch[
"continuous_action"
]
feed_dict[policy.output] = batch["actions"]
feed_dict[policy.output] = batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[policy.prev_action] = batch["prev_action"]
feed_dict[policy.action_masks] = batch["action_mask"]

284
ml-agents/mlagents/trainers/sac/optimizer_torch.py


import numpy as np
from typing import Dict, List, Mapping, cast, Tuple, Optional
from typing import Dict, List, Mapping, NamedTuple, cast, Tuple, Optional
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.settings import TrainerSettings, SACSettings
from contextlib import ExitStack

action_spec: ActionSpec,
):
super().__init__()
self.action_spec = action_spec
if self.action_spec.is_continuous():
self.act_size = self.action_spec.continuous_size
num_value_outs = 1
num_action_ins = self.act_size
num_value_outs = max(sum(action_spec.discrete_branches), 1)
num_action_ins = int(action_spec.continuous_size)
else:
self.act_size = self.action_spec.discrete_branches
num_value_outs = sum(self.act_size)
num_action_ins = 0
self.q1_network = ValueNetwork(
stream_names,
observation_shapes,

)
return q1_out, q2_out
class TargetEntropy(NamedTuple):
discrete: List[float] = [] # One per branch
continuous: float = 0.0
class LogEntCoef(nn.Module):
def __init__(self, discrete, continuous):
super().__init__()
self.discrete = discrete
self.continuous = continuous
def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
super().__init__(policy, trainer_params)
hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)

self.policy = policy
self.act_size = policy.act_size
policy_network_settings = policy.network_settings
self.tau = hyperparameters.tau

name: int(not self.reward_signals[name].ignore_done)
for name in self.stream_names
}
self._action_spec = self.policy.behavior_spec.action_spec
self.policy.behavior_spec.action_spec,
self._action_spec,
)
self.target_network = ValueNetwork(

self.policy.actor_critic.critic, self.target_network, 1.0
)
self._log_ent_coef = torch.nn.Parameter(
torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))),
# We create one entropy coefficient per action, whether discrete or continuous.
_disc_log_ent_coef = torch.nn.Parameter(
torch.log(
torch.as_tensor(
[self.init_entcoef] * len(self._action_spec.discrete_branches)
)
),
if self.policy.use_continuous_act:
self.target_entropy = torch.as_tensor(
-1
* self.continuous_target_entropy_scale
* np.prod(self.act_size[0]).astype(np.float32)
)
else:
self.target_entropy = [
self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
for i in self.act_size
]
_cont_log_ent_coef = torch.nn.Parameter(
torch.log(torch.as_tensor([self.init_entcoef])), requires_grad=True
)
self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef
)
_cont_target = (
-1
* self.continuous_target_entropy_scale
* np.prod(self._action_spec.continuous_size).astype(np.float32)
)
_disc_target = [
self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
for i in self._action_spec.discrete_branches
]
self.target_entropy = TorchSACOptimizer.TargetEntropy(
continuous=_cont_target, discrete=_disc_target
)
self.policy.actor_critic.distribution.parameters()
self.policy.actor_critic.action_model.parameters()
)
value_params = list(self.value_network.parameters()) + list(
self.policy.actor_critic.critic.parameters()

value_params, lr=hyperparameters.learning_rate
)
self.entropy_optimizer = torch.optim.Adam(
[self._log_ent_coef], lr=hyperparameters.learning_rate
self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate
)
self._move_to_device(default_device())
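Summarizing the entropy bookkeeping set up above as a stand-alone sketch (the init value and the two scale factors below are placeholders, not the trainer's actual hyperparameters): there is one log entropy coefficient per discrete branch plus a single coefficient for the continuous block, and the target entropies are built from the branch sizes and the continuous size respectively.

import numpy as np
import torch

init_entcoef = 1.0                      # placeholder
discrete_branches = (3, 2)              # placeholder hybrid spec
continuous_size = 2
discrete_target_entropy_scale = 0.2     # placeholder scale
continuous_target_entropy_scale = 1.0   # placeholder scale

# One log coefficient per discrete branch, one shared continuous coefficient.
disc_log_ent_coef = torch.nn.Parameter(
    torch.log(torch.as_tensor([init_entcoef] * len(discrete_branches)))
)
cont_log_ent_coef = torch.nn.Parameter(torch.log(torch.as_tensor([init_entcoef])))

# Hybrid targets: one log-sized target per branch, one scalar continuous target.
disc_target = [
    discrete_target_entropy_scale * np.log(i).astype(np.float32)
    for i in discrete_branches
]
cont_target = (
    -1 * continuous_target_entropy_scale * np.prod(continuous_size).astype(np.float32)
)

Keeping the two coefficient blocks separate is what lets the losses below tune the discrete and continuous entropy bonuses independently.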

def sac_value_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
discrete: bool,
_ent_coef = torch.exp(self._log_ent_coef)
_cont_ent_coef = self._log_ent_coef.continuous.exp()
_disc_ent_coef = self._log_ent_coef.discrete.exp()
if not discrete:
if self._action_spec.discrete_size <= 0:
action_probs = log_probs.exp()
disc_action_probs = log_probs.all_discrete_tensor.exp()
q1p_out[name] * action_probs, self.act_size
q1p_out[name] * disc_action_probs,
self._action_spec.discrete_branches,
q2p_out[name] * action_probs, self.act_size
q2p_out[name] * disc_action_probs,
self._action_spec.discrete_branches,
)
_q1p_mean = torch.mean(
torch.stack(

min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean)
value_losses = []
if not discrete:
if self._action_spec.discrete_size <= 0:
_ent_coef * log_probs, dim=1
_cont_ent_coef * log_probs.continuous_tensor, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks

disc_log_probs = log_probs.all_discrete_tensor
log_probs * log_probs.exp(), self.act_size
disc_log_probs * disc_log_probs.exp(),
self._action_spec.discrete_branches,
torch.sum(_ent_coef[i] * _lp, dim=1, keepdim=True)
torch.sum(_disc_ent_coef[i] * _lp, dim=1, keepdim=True)
for i, _lp in enumerate(branched_per_action_ent)
]
)

branched_ent_bonus, axis=0
)
# Add continuous entropy bonus to minimum Q
if self._action_spec.continuous_size > 0:
v_backup += torch.sum(
_cont_ent_coef * log_probs.continuous_tensor,
dim=1,
keepdim=True,
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup.squeeze()),
loss_masks,

def sac_policy_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
discrete: bool,
_ent_coef = torch.exp(self._log_ent_coef)
_cont_ent_coef, _disc_ent_coef = (
self._log_ent_coef.continuous,
self._log_ent_coef.discrete,
)
_cont_ent_coef = _cont_ent_coef.exp()
_disc_ent_coef = _disc_ent_coef.exp()
if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
else:
action_probs = log_probs.exp()
batch_policy_loss = 0
if self._action_spec.discrete_size > 0:
disc_log_probs = log_probs.all_discrete_tensor
disc_action_probs = disc_log_probs.exp()
log_probs * action_probs, self.act_size
disc_log_probs * disc_action_probs, self._action_spec.discrete_branches
mean_q1 * action_probs, self.act_size
mean_q1 * disc_action_probs, self._action_spec.discrete_branches
torch.sum(_ent_coef[i] * _lp - _qt, dim=1, keepdim=True)
torch.sum(_disc_ent_coef[i] * _lp - _qt, dim=1, keepdim=False)
for i, (_lp, _qt) in enumerate(
zip(branched_per_action_ent, branched_q_term)
)

batch_policy_loss = torch.squeeze(branched_policy_loss)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
batch_policy_loss += torch.sum(branched_policy_loss, dim=1)
all_mean_q1 = torch.sum(disc_action_probs * mean_q1, dim=1)
else:
all_mean_q1 = mean_q1
if self._action_spec.continuous_size > 0:
cont_log_probs = log_probs.continuous_tensor
batch_policy_loss += torch.mean(
_cont_ent_coef * cont_log_probs - all_mean_q1.unsqueeze(1), dim=1
)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor
if not discrete:
with torch.no_grad():
target_current_diff = torch.sum(log_probs + self.target_entropy, dim=1)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
)
else:
_cont_ent_coef, _disc_ent_coef = (
self._log_ent_coef.continuous,
self._log_ent_coef.discrete,
)
entropy_loss = 0
if self._action_spec.discrete_size > 0:
# Break discrete log probs into their separate branches
disc_log_probs = log_probs.all_discrete_tensor
log_probs * log_probs.exp(), self.act_size
disc_log_probs * disc_log_probs.exp(),
self._action_spec.discrete_branches,
branched_per_action_ent, self.target_entropy
branched_per_action_ent, self.target_entropy.discrete
)
],
axis=1,

)
entropy_loss = -1 * ModelUtils.masked_mean(
torch.mean(self._log_ent_coef * target_current_diff, axis=1), loss_masks
entropy_loss += -1 * ModelUtils.masked_mean(
torch.mean(_disc_ent_coef * target_current_diff, axis=1), loss_masks
)
if self._action_spec.continuous_size > 0:
with torch.no_grad():
cont_log_probs = log_probs.continuous_tensor
target_current_diff = torch.sum(
cont_log_probs + self.target_entropy.continuous, dim=1
)
# We update all the _cont_ent_coef as one block
entropy_loss += -1 * ModelUtils.masked_mean(
_cont_ent_coef * target_current_diff, loss_masks
)
return entropy_loss

) -> Dict[str, torch.Tensor]:
condensed_q_output = {}
onehot_actions = ModelUtils.actions_to_onehot(discrete_actions, self.act_size)
onehot_actions = ModelUtils.actions_to_onehot(
discrete_actions, self._action_spec.discrete_branches
)
branched_q = ModelUtils.break_into_branches(item, self.act_size)
branched_q = ModelUtils.break_into_branches(
item, self._action_spec.discrete_branches
)
only_action_qs = torch.stack(
[
torch.sum(_act * _q, dim=1, keepdim=True)

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
if self.policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
actions = AgentAction.from_dict(batch)
memories_list = [
ModelUtils.list_to_tensor(batch["memory"][i])

self.target_network.network_body.copy_normalization(
self.policy.actor_critic.network_body
)
(sampled_actions, _, log_probs, _, _) = self.policy.sample_actions(
(
sampled_actions,
log_probs,
_,
value_estimates,
_,
) = self.policy.actor_critic.get_action_stats_and_value(
seq_len=self.policy.sequence_length,
all_log_probs=not self.policy.use_continuous_act,
sequence_length=self.policy.sequence_length,
value_estimates, _ = self.policy.actor_critic.critic_pass(
vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
cont_sampled_actions = sampled_actions.continuous_tensor
cont_actions = actions.continuous_tensor
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
cont_sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,
if self.policy.use_continuous_act:
squeezed_actions = actions.squeeze(-1)
# Only need grad for q1, as that is used for policy.
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
sampled_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
squeezed_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
cont_actions,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
if self._action_spec.discrete_size > 0:
disc_actions = actions.discrete_tensor
q1_stream = self._condense_q_streams(q1_out, disc_actions)
q2_stream = self._condense_q_streams(q2_out, disc_actions)
else:
else:
# For discrete, you don't need to backprop through the Q for the policy
q1p_out, q2p_out = self.value_network(
vec_obs,
vis_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q1_grad=False,
q2_grad=False,
)
q1_out, q2_out = self.value_network(
vec_obs,
vis_obs,
memories=q_memories,
sequence_length=self.policy.sequence_length,
)
q1_stream = self._condense_q_streams(q1_out, actions)
q2_stream = self._condense_q_streams(q2_out, actions)
with torch.no_grad():
target_values, _ = self.target_network(

sequence_length=self.policy.sequence_length,
)
masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
use_discrete = not self.policy.use_continuous_act
dones = ModelUtils.list_to_tensor(batch["done"])
q1_loss, q2_loss = self.sac_q_loss(

log_probs, value_estimates, q1p_out, q2p_out, masks, use_discrete
log_probs, value_estimates, q1p_out, q2p_out, masks
policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete)
entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete)
policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
entropy_loss = self.sac_entropy_loss(log_probs, masks)
total_value_loss = q1_loss + q2_loss + value_loss

"Losses/Value Loss": value_loss.item(),
"Losses/Q1 Loss": q1_loss.item(),
"Losses/Q2 Loss": q2_loss.item(),
"Policy/Entropy Coeff": torch.mean(torch.exp(self._log_ent_coef)).item(),
"Policy/Discrete Entropy Coeff": torch.mean(
torch.exp(self._log_ent_coef.discrete)
).item(),
"Policy/Continuous Entropy Coeff": torch.mean(
torch.exp(self._log_ent_coef.continuous)
).item(),
"Policy/Learning Rate": decay_lr,
}

2
ml-agents/mlagents/trainers/simple_env_manager.py


self.previous_all_action_info = all_action_info
for brain_name, action_info in all_action_info.items():
self.env.set_actions(brain_name, action_info.action)
self.env.set_actions(brain_name, action_info.env_action)
self.env.step()
all_step_result = self._generate_all_results()

4
ml-agents/mlagents/trainers/subprocess_env_manager.py


if req.cmd == EnvironmentCommand.STEP:
all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)
if len(action_info.agent_ids) > 0:
env.set_actions(brain_name, action_info.env_action)
env.step()
all_step_result = _generate_all_results()
# The timers in this process are independent from those in the other worker processes and the "main" process
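
The env managers above now forward action_info.env_action to set_actions while the trainer keeps action_info.action for its buffers. A small illustrative sketch of how the two fields can differ, assuming clipping of continuous actions as the example reason; the field order follows the ActionInfo usages later in this diff.

import numpy as np

from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.action_info import ActionInfo

# Illustrative only: clipping is just one example of why env_action may differ.
raw_action = ActionTuple(continuous=np.array([[1.7]], dtype=np.float32))
env_action = ActionTuple(continuous=np.clip(raw_action.continuous, -1.0, 1.0))

action_info = ActionInfo(
    action=raw_action,        # kept for the trainer's buffer
    env_action=env_action,    # what the env manager sends to the environment
    value=np.array([0.0], dtype=np.float32),
    outputs={},
    agent_ids=np.array([0]),
)
# The env manager then only forwards the env_action:
# env.set_actions(brain_name, action_info.env_action)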

24
ml-agents/mlagents/trainers/tests/mock_brain.py


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents_envs.base_env import (
DecisionSteps,

ActionTuple,
)

steps_list = []
action_size = action_spec.discrete_size + action_spec.continuous_size
action_probs = np.ones(
int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
dtype=np.float32,
)
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

action = np.zeros(action_size, dtype=np.float32)
action_pre = np.zeros(action_size, dtype=np.float32)
action = ActionTuple(
continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
)
action_probs = LogProbsTuple(
continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
)
action_mask = (
[
[False for _ in range(branch)]

else None
)
prev_action = np.ones(action_size, dtype=np.float32)
if action_spec.is_discrete():
prev_action = np.ones(action_size, dtype=np.int32)
else:
prev_action = np.ones(action_size, dtype=np.float32)
max_step = False
memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"

done=done,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
interrupted=max_step,

done=not max_step_complete,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_mask,
prev_action=prev_action,
interrupted=max_step_complete,

82
ml-agents/mlagents/trainers/tests/simple_test_envs.py


from mlagents_envs.base_env import (
ActionSpec,
ActionTuple,
BaseEnv,
BehaviorSpec,
DecisionSteps,

OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
STEP_SIZE = 0.2
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1

def __init__(
self,
brain_names,
use_discrete,
action_size=1,
action_sizes=(1, 0),
self.discrete = use_discrete
if use_discrete:
action_spec = ActionSpec.create_discrete(
tuple(2 for _ in range(action_size))
)
else:
action_spec = ActionSpec.create_continuous(action_size)
continuous_action_size, discrete_action_size = action_sizes
discrete_tuple = tuple(2 for _ in range(discrete_action_size))
action_spec = ActionSpec(continuous_action_size, discrete_tuple)
self.total_action_size = (
continuous_action_size + discrete_action_size
) # to set the goals/positions
self.action_spec = action_spec
self.action_size = action_size
self.action_spec = action_spec
self.names = brain_names
self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}

def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
_act = self.action[name]
if self.action_spec.continuous_size > 0:
for _cont in _act.continuous[0]:
deltas.append(_cont)
if self.action_spec.discrete_size > 0:
for _disc in _act.discrete[0]:
deltas.append(1 if _disc else -1)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta

return done
def _generate_mask(self):
if self.discrete:
action_mask = None
if self.action_spec.discrete_size > 0:
ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
ndmask = np.array(
2 * self.action_spec.discrete_size * [False], dtype=np.bool
)
else:
action_mask = None
return action_mask
def _compute_reward(self, name: str, done: bool) -> float:

def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.positions[name] = [0.0 for _ in range(self.action_size)]
self.positions[name] = [0.0 for _ in range(self.total_action_size)]
self.step_count[name] = 0
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1

class MemoryEnvironment(SimpleEnvironment):
def __init__(self, brain_names, use_discrete, step_size=0.2):
super().__init__(brain_names, use_discrete, step_size=step_size)
def __init__(self, brain_names, action_sizes=(1, 0), step_size=0.2):
super().__init__(brain_names, action_sizes=action_sizes, step_size=step_size)
# Number of steps to reveal the goal for. Lower is harder. Should be
# less than 1/step_size to force the agent to use memory
self.num_show_steps = 2

def __init__(
self,
brain_names,
use_discrete,
action_sizes=(1, 0),
use_discrete,
action_sizes=action_sizes,
)
self.demonstration_protos: Dict[str, List[AgentInfoActionPairProto]] = {}
self.n_demos = n_demos

def step(self) -> None:
super().step()
for name in self.names:
discrete_actions = (
self.action[name].discrete
if self.action_spec.discrete_size > 0
else None
)
continuous_actions = (
self.action[name].continuous
if self.action_spec.continuous_size > 0
else None
)
self.step_result[name][0], self.step_result[name][1], self.action[name]
self.step_result[name][0],
self.step_result[name][1],
continuous_actions,
discrete_actions,
)
self.demonstration_protos[name] = self.demonstration_protos[name][
-self.n_demos :

self.reset()
for _ in range(self.n_demos):
for name in self.names:
if self.discrete:
self.action[name] = [[1]] if self.goal[name] > 0 else [[0]]
if self.action_spec.discrete_size > 0:
self.action[name] = ActionTuple(
np.array([], dtype=np.float32),
np.array(
[[1]] if self.goal[name] > 0 else [[0]], dtype=np.int32
),
)
self.action[name] = [[float(self.goal[name])]]
self.action[name] = ActionTuple(
np.array([[float(self.goal[name])]], dtype=np.float32),
np.array([], dtype=np.int32),
)
self.step()
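
The test environment above builds a hybrid ActionSpec and feeds ActionTuple objects through set_actions. A minimal sketch of that construction, assuming the (n_agents, size) array convention used elsewhere in this diff.

import numpy as np

from mlagents_envs.base_env import ActionSpec, ActionTuple

# A hybrid spec: 1 continuous dimension plus one discrete branch with 2 choices,
# mirroring SimpleEnvironment's ActionSpec(continuous_action_size, discrete_tuple).
action_spec = ActionSpec(1, (2,))

n_agents = 1
# Sketch: shapes follow the (n_agents, size) convention used in this diff.
action = ActionTuple(
    continuous=np.zeros((n_agents, action_spec.continuous_size), dtype=np.float32),
    discrete=np.zeros((n_agents, action_spec.discrete_size), dtype=np.int32),
)
# An environment would consume this via env.set_actions(behavior_name, action).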

66
ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py


dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if discrete:
n_agents = len(update_buffer["discrete_log_probs"])
update_buffer["discrete_log_probs"] = np.ones(
(n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
dtype=np.float32,
)
else:
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if discrete:
n_agents = len(update_buffer["discrete_log_probs"])
update_buffer["discrete_log_probs"] = np.ones(
(n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
dtype=np.float32,
)
else:
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

use_visual=False,
)
# Test update
update_buffer = mb.simulate_rollout(
BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
)
behavior_spec = optimizer.policy.behavior_spec
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
n_agents = len(update_buffer["continuous_log_probs"])
update_buffer["continuous_log_probs"] = np.ones(
(n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

buffer["curiosity_returns"] = buffer["environment_rewards"]
buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
buffer["advantages"] = buffer["environment_rewards"]
# NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
if use_discrete:
n_agents = len(buffer["discrete_log_probs"])
buffer["discrete_log_probs"].reset_field()
for _ in range(n_agents):
buffer["discrete_log_probs"].append(
np.ones(
int(sum(mock_behavior_spec.action_spec.discrete_branches)),
dtype=np.float32,
)
)
else:
n_agents = len(buffer["continuous_log_probs"])
buffer["continuous_log_probs"].reset_field()
for _ in range(n_agents):
buffer["continuous_log_probs"].append(
np.ones(
mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
)
)
trainer.update_buffer = buffer
trainer._update_policy()
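
The NOTE repeated in these TF tests is about buffer shapes: TF stores one log prob per discrete action choice, while the PyTorch trainer stores one per branch. A tiny sketch of the two padded buffer entries, with hypothetical sizes.

import numpy as np

# Hypothetical sizes, just to show the shapes the tests pad.
n_agents = 4
discrete_branches = (2, 3)   # two branches with 2 and 3 choices
continuous_size = 2

discrete_log_probs = np.ones((n_agents, int(sum(discrete_branches))), dtype=np.float32)
continuous_log_probs = np.ones((n_agents, continuous_size), dtype=np.float32)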

128
ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py


assert all(reward > success_threshold for reward in processed_rewards)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(
PPO_TF_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)

_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_ppo(num_visual, use_discrete):
def test_visual_ppo(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

PPO_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=300,
max_steps=400,
summary_freq=100,
framework=FrameworkType.TENSORFLOW,
)

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_ppo(action_sizes):
env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
new_network_settings = attr.evolve(
PPO_TF_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(SAC_TF_CONFIG, framework=FrameworkType.TENSORFLOW)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
config = attr.evolve(
SAC_TF_CONFIG, framework=FrameworkType.TENSORFLOW, max_steps=900
)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(SAC_TF_CONFIG.hyperparameters, buffer_init_steps=2000)
config = attr.evolve(
SAC_TF_CONFIG,

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_sac(num_visual, use_discrete):
def test_visual_sac(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

SAC_TF_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=100,
max_steps=200,
framework=FrameworkType.TENSORFLOW,
)
# The number of steps is pretty small for these encoders

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.2 if use_discrete else 0.5
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_sac(action_sizes):
step_size = 0.2 if action_sizes == (0, 1) else 0.5
[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
[BRAIN_NAME], action_sizes=action_sizes, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_TF_CONFIG.network_settings,

_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000

_check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.

)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,

_check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.

@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
def record_demo(action_sizes, num_visual=0, num_vector=1):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=num_vector,
n_demos=100,

env.solve()
continuous_size, discrete_size = action_sizes
use_discrete = True if discrete_size > 0 else False
vector_action_size=[2] if use_discrete else [1],
vector_action_descriptions=[""],
vector_action_space_type=discrete if use_discrete else continuous,
vector_action_size_deprecated=[2] if use_discrete else [1],
vector_action_descriptions_deprecated=[""],
vector_action_space_type_deprecated=discrete
if use_discrete
else continuous,
brain_name=BRAIN_NAME,
is_training=True,
)

return record_demo
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
def test_gail(simple_record, action_sizes, trainer_config):
demo_path = simple_record(action_sizes)
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.2)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_ppo(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)

_check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_sac(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)

12
ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py


[], np.array([], dtype=np.float32), np.array([0]), None
)
result = policy.get_action(step_with_agents, worker_id=0)
assert result == ActionInfo(None, None, {}, [0])
assert result == ActionInfo(None, None, None, {}, [0])
def test_take_action_returns_action_info_when_available():

policy_eval_out = {
"action": np.array([1.0], dtype=np.float32),
"action": np.array([[1.0]], dtype=np.float32),
"pre_action": np.array([[1.0]], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}

)
result = policy.get_action(step_with_agents)
print(result)
policy_eval_out["action"], policy_eval_out["value"], policy_eval_out, [0]
policy_eval_out["action"],
policy_eval_out["env_action"],
policy_eval_out["value"],
policy_eval_out,
[0],
)
assert result == expected

41
ml-agents/mlagents/trainers/tests/test_agent_processor.py


AgentManagerQueue,
)
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
from mlagents_envs.base_env import ActionSpec
from mlagents_envs.base_env import ActionSpec, ActionTuple
def create_mock_policy():

mock_policy.retrieve_previous_action.return_value = np.zeros(
(1, 1), dtype=np.float32
)
mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.int32)
return mock_policy

)
fake_action_outputs = {
"action": [0.1, 0.1],
"action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
"pre_action": [0.1, 0.1],
"log_probs": [0.1, 0.1],
"log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
}
mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
num_agents=2,

fake_action_info = ActionInfo(
action=[0.1, 0.1],
action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
value=[0.1, 0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_steps.agent_id,

action_spec=ActionSpec.create_continuous(2),
)
processor.add_experiences(
mock_decision_steps, mock_terminal_steps, 0, ActionInfo([], [], {}, [])
mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
)
# Assert that the AgentProcessor is still empty
assert len(processor.experience_buffers[0]) == 0

max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
"action": [0.1],
"action": ActionTuple(continuous=np.array([[0.1]])),
"pre_action": [0.1],
"log_probs": [0.1],
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
observation_shapes=[(8,)],

done=True,
)
fake_action_info = ActionInfo(
action=[0.1],
action=ActionTuple(continuous=np.array([[0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

processor.add_experiences(
mock_decision_step, mock_terminal_step, _ep, fake_action_info
)
add_calls.append(mock.call([get_global_agent_id(_ep, 0)], [0.1]))
add_calls.append(
mock.call([get_global_agent_id(_ep, 0)], fake_action_outputs["action"])
)
processor.add_experiences(
mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
)

max_trajectory_length=5,
stats_reporter=StatsReporter("testcat"),
)
"action": [0.1],
"action": ActionTuple(continuous=np.array([[0.1]])),
"pre_action": [0.1],
"log_probs": [0.1],
"log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
mock_decision_step, mock_terminal_step = mb.create_mock_steps(
num_agents=1,
observation_shapes=[(8,)],

action=[0.1],
action=ActionTuple(continuous=np.array([[0.1]])),
env_action=ActionTuple(continuous=np.array([[0.1]])),
value=[0.1],
outputs=fake_action_outputs,
agent_ids=mock_decision_step.agent_id,

10
ml-agents/mlagents/trainers/tests/test_demo_loader.py


assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_load_demo_dir():

assert len(pair_infos) == total_expected
_, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1, BEHAVIOR_SPEC)
assert len(demo_buffer["actions"]) == total_expected - 1
assert (
len(demo_buffer["continuous_action"]) == total_expected - 1
or len(demo_buffer["discrete_action"]) == total_expected - 1
)
def test_demo_mismatch():

2
ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py


@pytest.mark.parametrize("num_envs", [1, 4])
def test_subprocess_env_endtoend(num_envs):
def simple_env_factory(worker_id, config):
env = SimpleEnvironment(["1D"], use_discrete=True)
env = SimpleEnvironment(["1D"], action_sizes=(0, 1))
return env
env_manager = SubprocessEnvManager(

7
ml-agents/mlagents/trainers/tests/test_trajectory.py


"memory",
"masks",
"done",
"actions_pre",
"actions",
"action_probs",
"continuous_action",
"discrete_action",
"continuous_log_probs",
"discrete_log_probs",
"action_mask",
"prev_action",
"environment_rewards",

13
ml-agents/mlagents/trainers/tests/torch/saver/test_saver.py


).unsqueeze(0)
with torch.no_grad():
_, _, log_probs1, _, _ = policy1.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
_, log_probs1, _, _ = policy1.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
_, _, log_probs2, _, _ = policy2.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories, all_log_probs=True
_, log_probs2, _, _ = policy2.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
np.testing.assert_array_equal(log_probs1, log_probs2)
np.testing.assert_array_equal(
log_probs1.all_discrete_tensor, log_probs2.all_discrete_tensor
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

2
ml-agents/mlagents/trainers/tests/torch/test_distributions.py


optimizer = torch.optim.Adam(gauss_dist.parameters(), lr=3e-3)
for _ in range(50):
dist_inst = gauss_dist(sample_embedding)[0]
dist_inst = gauss_dist(sample_embedding)
if tanh_squash:
assert isinstance(dist_inst, TanhGaussianDistInstance)
else:

90
ml-agents/mlagents/trainers/tests/torch/test_networks.py


from mlagents.trainers.torch.networks import (
NetworkBody,
ValueNetwork,
SimpleActor,
from mlagents.trainers.torch.distributions import (
GaussianDistInstance,
CategoricalDistInstance,
)
from mlagents.trainers.tests.torch.test_encoders import compare_models
def test_networkbody_vector():

assert _out[0] == pytest.approx(1.0, abs=0.1)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_actor(use_discrete):
obs_size = 4
network_settings = NetworkSettings()
obs_shapes = [(obs_size,)]
act_size = [2]
if use_discrete:
masks = torch.ones((1, 1))
action_spec = ActionSpec.create_discrete(tuple(act_size))
else:
masks = None
action_spec = ActionSpec.create_continuous(act_size[0])
actor = SimpleActor(obs_shapes, network_settings, action_spec)
# Test get_dist
sample_obs = torch.ones((1, obs_size))
dists, _ = actor.get_dists([sample_obs], [], masks=masks)
for dist in dists:
if use_discrete:
assert isinstance(dist, CategoricalDistInstance)
else:
assert isinstance(dist, GaussianDistInstance)
# Test sample_actions
actions = actor.sample_action(dists)
for act in actions:
if use_discrete:
assert act.shape == (1, 1)
else:
assert act.shape == (1, act_size[0])
# Test forward
actions, ver_num, mem_size, is_cont, act_size_vec = actor.forward(
[sample_obs], [], masks=masks
)
for act in actions:
# This is different from above for ONNX export
if use_discrete:
assert act.shape == tuple(act_size)
else:
assert act.shape == (act_size[0], 1)
assert mem_size == 0
assert is_cont == int(not use_discrete)
assert act_size_vec == torch.tensor(act_size)
memory=NetworkSettings.MemorySettings() if lstm else None
memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True
act_size = [2]
act_size = 2
mask = torch.ones([1, act_size * 2])
action_spec = ActionSpec.create_continuous(act_size[0])
# action_spec = ActionSpec.create_continuous(act_size[0])
action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
actor = ac_type(obs_shapes, network_settings, action_spec, stream_names)
if lstm:
sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))

else:
assert value_out[stream].shape == (1,)
# Test get_dist_and_value
dists, value_out, mem_out = actor.get_dist_and_value(
[sample_obs], [], memories=memories
# Test get action stats and_value
action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
[sample_obs], [], memories=memories, masks=mask
if lstm:
assert action.continuous_tensor.shape == (64, 2)
else:
assert action.continuous_tensor.shape == (1, 2)
assert len(action.discrete_list) == 2
for _disc in action.discrete_list:
if lstm:
assert _disc.shape == (64, 1)
else:
assert _disc.shape == (1, 1)
for dist in dists:
assert isinstance(dist, GaussianDistInstance)
# Test normalization
actor.update_normalization(sample_obs)
if isinstance(actor, SeparateActorCritic):
for act_proc, crit_proc in zip(
actor.network_body.vector_processors,
actor.critic.network_body.vector_processors,
):
assert compare_models(act_proc, crit_proc)

28
ml-agents/mlagents/trainers/tests/torch/test_policy.py


from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.agent_action import AgentAction
VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8

run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
if discrete:
run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
run_out["action"].discrete.shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
assert run_out["action"].continuous.shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
if policy.use_continuous_act:
actions = ModelUtils.list_to_tensor(buffer["actions"]).unsqueeze(-1)
else:
actions = ModelUtils.list_to_tensor(buffer["actions"], dtype=torch.long)
agent_action = AgentAction.from_dict(buffer)
vis_obs = []
for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors):
vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])

vec_obs,
vis_obs,
masks=act_masks,
actions=actions,
actions=agent_action,
memories=memories,
seq_len=policy.sequence_length,
)
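
AgentAction.from_dict(buffer) replaces the manual tensor conversion above. The following is a rough, hypothetical stand-in for what such a helper does with the new continuous_action / discrete_action buffer fields; it is not the library implementation.

from typing import List, NamedTuple, Optional

import numpy as np
import torch


class HybridAction(NamedTuple):
    # Hypothetical stand-in for AgentAction: a continuous tensor plus a list of
    # per-branch discrete tensors.
    continuous_tensor: Optional[torch.Tensor]
    discrete_list: Optional[List[torch.Tensor]]


def from_buffer(batch: dict) -> HybridAction:
    # Rough illustration only, assuming the buffer stores per-agent rows.
    continuous = None
    discrete = None
    if "continuous_action" in batch:
        continuous = torch.as_tensor(np.asarray(batch["continuous_action"]), dtype=torch.float32)
    if "discrete_action" in batch:
        disc = torch.as_tensor(np.asarray(batch["discrete_action"]), dtype=torch.long)
        discrete = [disc[..., i] for i in range(disc.shape[-1])]
    return HybridAction(continuous, discrete)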

_size = policy.behavior_spec.action_spec.continuous_size
assert log_probs.shape == (64, _size)
assert log_probs.flatten().shape == (64, _size)
assert entropy.shape == (64,)
for val in values.values():
assert val.shape == (64,)

if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
(
sampled_actions,
clipped_actions,
log_probs,
entropies,
memories,
) = policy.sample_actions(
(sampled_actions, log_probs, entropies, memories) = policy.sample_actions(
all_log_probs=not policy.use_continuous_act,
assert log_probs.shape == (
assert log_probs.all_discrete_tensor.shape == (
assert log_probs.shape == (64, policy.behavior_spec.action_spec.continuous_size)
assert clipped_actions.shape == (
assert log_probs.continuous_tensor.shape == (
64,
policy.behavior_spec.action_spec.continuous_size,
)

15
ml-agents/mlagents/trainers/tests/torch/test_ppo.py


update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action choice, whereas
# in PyTorch they are saved as a single (total) log probability per branch. So we need to
# modify the log_probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
return_stats = optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
# in PyTorch it is saved as the total probability per branch. So we need to modify the
# log prob in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
update_buffer["continuous_action"]
)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
# NOTE: In TensorFlow, the log_probs are saved as one for every discrete action choice, whereas
# in PyTorch they are saved as a single (total) log probability per branch. So we need to
# modify the log_probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

2
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py


for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0]
target = torch.tensor(buffer["actions"][0])
target = torch.tensor(buffer["continuous_action"][0])
error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

11
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py


np.random.normal(size=shape).astype(np.float32)
for shape in behavior_spec.observation_shapes
]
action = behavior_spec.action_spec.random_action(1)[0, :]
action_buffer = behavior_spec.action_spec.random_action(1)
action = {}
if behavior_spec.action_spec.continuous_size > 0:
action["continuous_action"] = action_buffer.continuous
if behavior_spec.action_spec.discrete_size > 0:
action["discrete_action"] = action_buffer.discrete
for _ in range(number):
curr_split_obs = SplitObservations.from_observations(curr_observations)
next_split_obs = SplitObservations.from_observations(next_observations)

)
buffer["vector_obs"].append(curr_split_obs.vector_observations)
buffer["next_vector_in"].append(next_split_obs.vector_observations)
buffer["actions"].append(action)
for _act_type, _act in action.items():
buffer[_act_type].append(_act[0, :])
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)

3
ml-agents/mlagents/trainers/tests/torch/test_sac.py


"Losses/Value Loss",
"Losses/Q1 Loss",
"Losses/Q2 Loss",
"Policy/Entropy Coeff",
"Policy/Continuous Entropy Coeff",
"Policy/Discrete Entropy Coeff",
"Policy/Learning Rate",
]
for stat in required_stats:

132
ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py


from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
from mlagents_envs.communicator_objects.brain_parameters_pb2 import (
BrainParametersProto,
ActionSpecProto,
)
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import (

SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_ppo(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)

check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_ppo(num_visual, use_discrete):
def test_visual_ppo(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_ppo(action_sizes):
env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
new_network_settings = attr.evolve(
PPO_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
def test_2d_sac(action_sizes):
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=6000
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_visual_sac(num_visual, use_discrete):
def test_visual_sac(num_visual, action_sizes):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=0,
step_size=0.2,

def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
action_sizes=(0, 1),
num_visual=num_visual,
num_vector=0,
step_size=0.5,

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.2 if use_discrete else 0.5
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_recurrent_sac(action_sizes):
step_size = 0.2 if action_sizes == (0, 1) else 0.5
[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
[BRAIN_NAME], action_sizes=action_sizes, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_TORCH_CONFIG.network_settings,

check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000

@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.

)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,

check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_asymm_ghost_fails(action_sizes):
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.

@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
def record_demo(action_sizes, num_visual=0, num_vector=1):
use_discrete=use_discrete,
action_sizes=action_sizes,
num_visual=num_visual,
num_vector=num_vector,
n_demos=100,

env.solve()
agent_info_protos = env.demonstration_protos[BRAIN_NAME]
meta_data_proto = DemonstrationMetaProto()
continuous_action_size, discrete_action_size = action_sizes
action_spec_proto = ActionSpecProto(
num_continuous_actions=continuous_action_size,
num_discrete_actions=discrete_action_size,
discrete_branch_sizes=[2] if discrete_action_size > 0 else None,
)
vector_action_size=[2] if use_discrete else [1],
vector_action_descriptions=[""],
vector_action_space_type=discrete if use_discrete else continuous,
brain_name=BRAIN_NAME,
is_training=True,
brain_name=BRAIN_NAME, is_training=True, action_spec=action_spec_proto
action_type = "Discrete" if use_discrete else "Continuous"
action_type = "Discrete" if action_sizes else "Continuous"
demo_path_name = "1DTest" + action_type + ".demo"
demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)

@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
def test_gail(simple_record, action_sizes, trainer_config):
demo_path = simple_record(action_sizes)
env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.2)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_ppo(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)

check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_gail_visual_sac(simple_record, action_sizes):
demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
use_discrete=use_discrete,
action_sizes=action_sizes,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)

44
ml-agents/mlagents/trainers/tests/torch/test_utils.py


from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.torch.encoders import VectorInput
from mlagents.trainers.torch.distributions import (
CategoricalDistInstance,
GaussianDistInstance,
)
def test_min_visual_size():

]
for res, exp in zip(oh_actions, expected_result):
assert torch.equal(res, exp)
def test_get_probs_and_entropy():
# Test continuous
# Add two dists to the list. This isn't done in the code but we'd like to support it.
dist_list = [
GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
GaussianDistInstance(torch.zeros((1, 2)), torch.ones((1, 2))),
]
action_list = [torch.zeros((1, 2)), torch.zeros((1, 2))]
log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert log_probs.shape == (1, 2, 2)
assert entropies.shape == (1, 1, 2)
assert all_probs is None
for log_prob in log_probs.flatten():
# Log prob of standard normal at 0
assert log_prob == pytest.approx(-0.919, abs=0.01)
for ent in entropies.flatten():
# Entropy of a standard normal
assert ent == pytest.approx(1.42, abs=0.01)
# Test discrete
# Add two dists to the list.
act_size = 2
test_prob = torch.tensor(
[[1.0 - 0.1 * (act_size - 1)] + [0.1] * (act_size - 1)]
) # High prob for first action
dist_list = [CategoricalDistInstance(test_prob), CategoricalDistInstance(test_prob)]
action_list = [torch.tensor([0]), torch.tensor([1])]
log_probs, entropies, all_probs = ModelUtils.get_probs_and_entropy(
action_list, dist_list
)
assert all_probs.shape == (1, len(dist_list * act_size))
assert entropies.shape == (1, len(dist_list))
# Make sure the first action has higher probability than the others.
assert log_probs.flatten()[0] > log_probs.flatten()[1]
def test_masked_mean():

6
ml-agents/mlagents/trainers/tf/components/bc/module.py


self.policy.batch_size_ph: n_sequences,
self.policy.sequence_length_ph: self.policy.sequence_length,
}
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
feed_dict[self.model.action_in_expert] = mini_batch_demo["discrete_action"]
feed_dict[self.policy.action_masks] = np.ones(
(
self.n_sequences * self.policy.sequence_length,

)
else:
feed_dict[self.model.action_in_expert] = mini_batch_demo[
"continuous_action"
]
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
for i, _ in enumerate(self.policy.visual_in):

10
ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py


def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
}
if self.policy.use_vec_obs:

feed_dict[self.model.next_visual_in[i]] = _next_obs
if self.policy.use_continuous_act:
feed_dict[self.policy.selected_actions] = mini_batch["actions"]
feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.policy.output] = mini_batch["actions"]
feed_dict[self.policy.output] = mini_batch["discrete_action"]
unscaled_reward = self.policy.sess.run(
self.model.intrinsic_reward, feed_dict=feed_dict
)

policy.mask_input: mini_batch["masks"],
}
if self.policy.use_continuous_act:
feed_dict[policy.selected_actions] = mini_batch["actions"]
feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[policy.output] = mini_batch["actions"]
feed_dict[policy.output] = mini_batch["discrete_action"]
if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = mini_batch["vector_obs"]
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]

17
ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py


def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
self.policy.sequence_length_ph: self.policy.sequence_length,
}
if self.model.use_vail:

feed_dict[self.policy.visual_in[i]] = _obs
if self.policy.use_continuous_act:
feed_dict[self.policy.selected_actions] = mini_batch["actions"]
feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.policy.output] = mini_batch["actions"]
feed_dict[self.policy.output] = mini_batch["discrete_action"]
feed_dict[self.model.done_policy_holder] = np.array(
mini_batch["done"]
).flatten()

if self.model.use_vail:
feed_dict[self.model.use_noise] = [1]
feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
feed_dict[policy.selected_actions] = mini_batch["actions"]
feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
feed_dict[self.model.action_in_expert] = np.array(
mini_batch_demo["continuous_action"]
)
feed_dict[policy.output] = mini_batch["actions"]
feed_dict[policy.output] = mini_batch["discrete_action"]
feed_dict[self.model.action_in_expert] = np.array(
mini_batch_demo["discrete_action"]
)
if self.policy.use_vis_obs > 0:
for i in range(len(policy.visual_in)):

52
ml-agents/mlagents/trainers/torch/components/bc/module.py


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.torch.utils import ModelUtils

update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)}
return update_stats
def _behavioral_cloning_loss(self, selected_actions, log_probs, expert_actions):
if self.policy.use_continuous_act:
bc_loss = torch.nn.functional.mse_loss(selected_actions, expert_actions)
else:
def _behavioral_cloning_loss(
self,
selected_actions: AgentAction,
log_probs: ActionLogProbs,
expert_actions: torch.Tensor,
) -> torch.Tensor:
bc_loss = 0
if self.policy.behavior_spec.action_spec.continuous_size > 0:
bc_loss += torch.nn.functional.mse_loss(
selected_actions.continuous_tensor, expert_actions.continuous_tensor
)
if self.policy.behavior_spec.action_spec.discrete_size > 0:
one_hot_expert_actions = ModelUtils.actions_to_onehot(
expert_actions.discrete_tensor,
self.policy.behavior_spec.action_spec.discrete_branches,
)
log_probs, self.policy.act_size
log_probs.all_discrete_tensor,
self.policy.behavior_spec.action_spec.discrete_branches,
bc_loss = torch.mean(
bc_loss += torch.mean(
torch.stack(
[
torch.sum(

)
for log_prob_branch, expert_actions_branch in zip(
log_prob_branches, expert_actions
log_prob_branches, one_hot_expert_actions
)
]
)

"""
vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
act_masks = None
if self.policy.use_continuous_act:
expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])
else:
raw_expert_actions = ModelUtils.list_to_tensor(
mini_batch_demo["actions"], dtype=torch.long
)
expert_actions = ModelUtils.actions_to_onehot(
raw_expert_actions, self.policy.act_size
)
expert_actions = AgentAction.from_dict(mini_batch_demo)
if self.policy.behavior_spec.action_spec.discrete_size > 0:
act_masks = ModelUtils.list_to_tensor(
np.ones(
(

else:
vis_obs = []
(
selected_actions,
clipped_actions,
all_log_probs,
_,
_,
) = self.policy.sample_actions(
selected_actions, log_probs, _, _ = self.policy.sample_actions(
all_log_probs=True,
clipped_actions, all_log_probs, expert_actions
selected_actions, log_probs, expert_actions
)
self.optimizer.zero_grad()
bc_loss.backward()
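
The BC module above accumulates a continuous MSE term and a per-branch discrete cross-entropy term. A simplified, self-contained sketch of that loss shape, assuming log probs are already broken into per-branch tensors and skipping the AgentAction/ActionLogProbs wrappers.

from typing import List, Optional

import torch


def behavioral_cloning_loss(
    selected_continuous: Optional[torch.Tensor],      # (batch, continuous_size)
    expert_continuous: Optional[torch.Tensor],         # (batch, continuous_size)
    discrete_log_prob_branches: List[torch.Tensor],    # each (batch, branch_size) log probs
    expert_onehot_branches: List[torch.Tensor],         # each (batch, branch_size) one-hot
) -> torch.Tensor:
    # Simplified sketch, not the module's actual implementation.
    bc_loss = torch.zeros(())
    if selected_continuous is not None and expert_continuous is not None:
        # Continuous part: regress the policy's continuous action onto the expert's.
        bc_loss += torch.nn.functional.mse_loss(selected_continuous, expert_continuous)
    if discrete_log_prob_branches:
        # Discrete part: cross-entropy against the one-hot expert actions, per branch.
        per_branch = [
            torch.sum(-log_prob * onehot, dim=1, keepdim=True)
            for log_prob, onehot in zip(discrete_log_prob_branches, expert_onehot_branches)
        ]
        bc_loss += torch.mean(torch.stack(per_branch))
    return bc_loss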

78
ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py


import numpy as np
from typing import Dict
from typing import Dict, NamedTuple
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer

from mlagents.trainers.settings import CuriositySettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_flattener import ActionFlattener
class ActionPredictionTuple(NamedTuple):
continuous: torch.Tensor
discrete: torch.Tensor
class CuriosityRewardProvider(BaseRewardProvider):

specs.observation_shapes, state_encoder_settings
)
self._action_flattener = ModelUtils.ActionFlattener(self._action_spec)
self._action_flattener = ActionFlattener(self._action_spec)
self.inverse_model_action_prediction = torch.nn.Sequential(
LinearEncoder(2 * settings.encoding_size, 1, 256),
linear_layer(256, self._action_flattener.flattened_size),
self.inverse_model_action_encoding = torch.nn.Sequential(
LinearEncoder(2 * settings.encoding_size, 1, 256)
if self._action_spec.continuous_size > 0:
self.continuous_action_prediction = linear_layer(
256, self._action_spec.continuous_size
)
if self._action_spec.discrete_size > 0:
self.discrete_action_prediction = linear_layer(
256, sum(self._action_spec.discrete_branches)
)
self.forward_model_next_state_prediction = torch.nn.Sequential(
LinearEncoder(
settings.encoding_size + self._action_flattener.flattened_size, 1, 256

)
return hidden
def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor:
def predict_action(self, mini_batch: AgentBuffer) -> ActionPredictionTuple:
"""
In the continuous case, returns the predicted action.
In the discrete case, returns the logits.

)
hidden = self.inverse_model_action_prediction(inverse_model_input)
if self._action_spec.is_continuous():
return hidden
else:
continuous_pred = None
discrete_pred = None
hidden = self.inverse_model_action_encoding(inverse_model_input)
if self._action_spec.continuous_size > 0:
continuous_pred = self.continuous_action_prediction(hidden)
if self._action_spec.discrete_size > 0:
raw_discrete_pred = self.discrete_action_prediction(hidden)
hidden, self._action_spec.discrete_branches
raw_discrete_pred, self._action_spec.discrete_branches
return torch.cat(branches, dim=1)
discrete_pred = torch.cat(branches, dim=1)
return ActionPredictionTuple(continuous_pred, discrete_pred)
def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
"""

if self._action_spec.is_continuous():
action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
else:
action = torch.cat(
ModelUtils.actions_to_onehot(
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
self._action_spec.discrete_branches,
),
dim=1,
)
actions = AgentAction.from_dict(mini_batch)
flattened_action = self._action_flattener.forward(actions)
(self.get_current_state(mini_batch), action), dim=1
(self.get_current_state(mini_batch), flattened_action), dim=1
)
return self.forward_model_next_state_prediction(forward_model_input)

action prediction (given the current and next state).
"""
predicted_action = self.predict_action(mini_batch)
if self._action_spec.is_continuous():
actions = AgentAction.from_dict(mini_batch)
_inverse_loss = 0
if self._action_spec.continuous_size > 0:
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
- predicted_action
actions.continuous_tensor - predicted_action.continuous
return torch.mean(
_inverse_loss += torch.mean(
ModelUtils.dynamic_partition(
sq_difference,
ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),

else:
if self._action_spec.discrete_size > 0:
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
self._action_spec.discrete_branches,
actions.discrete_tensor, self._action_spec.discrete_branches
-torch.log(predicted_action + self.EPSILON) * true_action, dim=1
-torch.log(predicted_action.discrete + self.EPSILON) * true_action,
dim=1,
return torch.mean(
_inverse_loss += torch.mean(
ModelUtils.dynamic_partition(
cross_entropy,
ModelUtils.list_to_tensor(

)[1]
)
return _inverse_loss
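As a rough, self-contained sketch of the hybrid inverse-model loss above (the real code additionally routes each term through ModelUtils.dynamic_partition using the batch masks), with illustrative names:

from typing import NamedTuple, Optional
import torch

EPSILON = 1e-7  # small constant to keep the log finite, as in the diff

class ActionPredictionTuple(NamedTuple):
    continuous: Optional[torch.Tensor]
    discrete: Optional[torch.Tensor]

def inverse_loss(pred: ActionPredictionTuple, true_continuous, true_discrete_onehot):
    loss = torch.zeros(())
    if pred.continuous is not None:
        # Continuous part: squared error against the action actually taken.
        loss = loss + torch.mean(torch.sum((true_continuous - pred.continuous) ** 2, dim=1))
    if pred.discrete is not None:
        # Discrete part: cross-entropy against the one-hot action actually taken.
        loss = loss + torch.mean(
            torch.sum(-torch.log(pred.discrete + EPSILON) * true_discrete_onehot, dim=1)
        )
    return loss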
def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor:
"""

8
ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py


from mlagents.trainers.settings import GAILSettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_flattener import ActionFlattener
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.settings import NetworkSettings, EncoderType

vis_encode_type=EncoderType.SIMPLE,
memory=None,
)
self._action_flattener = ModelUtils.ActionFlattener(specs.action_spec)
self._action_flattener = ActionFlattener(specs.action_spec)
unencoded_size = (
self._action_flattener.flattened_size + 1 if settings.use_actions else 0
) # +1 is for dones

Creates the action Tensor. In the continuous case, this corresponds to the action. In
the discrete case, it corresponds to the concatenation of one-hot action Tensors.
"""
return self._action_flattener.forward(
torch.as_tensor(mini_batch["actions"], dtype=torch.float)
)
return self._action_flattener.forward(AgentAction.from_dict(mini_batch))
def get_state_inputs(
self, mini_batch: AgentBuffer
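A hedged sketch of what the use_actions path feeds the discriminator: the state encoding, the flattened hybrid action, and a done flag, which is where the flattened_size + 1 above comes from (all names here are placeholders, not the real GAIL code):

import torch

def discriminator_input(state_encoding, flattened_action, dones):
    # state_encoding: [batch, encoding_size]
    # flattened_action: [batch, flattened_size] (continuous dims + one-hot discrete branches)
    # dones: [batch, 1]
    return torch.cat([state_encoding, flattened_action, dones], dim=1)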

21
ml-agents/mlagents/trainers/torch/distributions.py


"""
pass
@abc.abstractmethod
def exported_model_output(self) -> torch.Tensor:
"""
Returns the tensor to be exported to ONNX for the distribution
"""
pass
class DiscreteDistInstance(DistInstance):
@abc.abstractmethod

dim=1,
keepdim=True,
) # Use equivalent behavior to TF
def exported_model_output(self):
return self.sample()
class TanhGaussianDistInstance(GaussianDistInstance):

return torch.log(self.probs + EPSILON)
def entropy(self):
return -torch.sum(self.probs * torch.log(self.probs + EPSILON), dim=-1)
return -torch.sum(
self.probs * torch.log(self.probs + EPSILON), dim=-1
).unsqueeze(-1)
def exported_model_output(self):
return self.all_log_prob()
class GaussianDistribution(nn.Module):

# verified version of Barracuda (1.0.2).
log_sigma = torch.cat([self.log_sigma] * inputs.shape[0], axis=0)
if self.tanh_squash:
return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
return [GaussianDistInstance(mu, torch.exp(log_sigma))]
return GaussianDistInstance(mu, torch.exp(log_sigma))
class MultiCategoricalDistribution(nn.Module):
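A toy illustration, not the real DistInstance classes, of the exported_model_output contract introduced above: continuous instances export a sampled action, discrete instances export the full log-probability vector, and the categorical entropy keeps a trailing dimension so it stacks with the continuous entropies:

import torch

EPSILON = 1e-7  # illustrative

class GaussianSketch:
    def __init__(self, mean, std):
        self.dist = torch.distributions.Normal(mean, std)

    def exported_model_output(self):
        # Continuous case: the exported ONNX node carries a sampled action.
        return self.dist.sample()

class CategoricalSketch:
    def __init__(self, logits):
        self.probs = torch.softmax(logits, dim=-1)

    def all_log_prob(self):
        return torch.log(self.probs + EPSILON)

    def entropy(self):
        # unsqueeze(-1) keeps a trailing dim, matching the change in the diff.
        return -torch.sum(self.probs * torch.log(self.probs + EPSILON), dim=-1).unsqueeze(-1)

    def exported_model_output(self):
        # Discrete case: the exported ONNX node carries per-action log-probabilities.
        return self.all_log_prob()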

31
ml-agents/mlagents/trainers/torch/model_serialization.py


+ [f"visual_observation_{i}" for i in range(self.policy.vis_obs_size)]
+ ["action_masks", "memories"]
)
self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}
self.output_names = [
"action",
"version_number",
"memory_size",
"is_continuous_control",
"action_output_shape",
]
self.dynamic_axes = {name: {0: "batch"} for name in self.input_names}
self.dynamic_axes.update({"action": {0: "batch"}})
self.output_names = ["version_number", "memory_size"]
if self.policy.behavior_spec.action_spec.continuous_size > 0:
self.output_names += [
"continuous_actions",
"continuous_action_output_shape",
]
self.dynamic_axes.update({"continuous_actions": {0: "batch"}})
if self.policy.behavior_spec.action_spec.discrete_size > 0:
self.output_names += ["discrete_actions", "discrete_action_output_shape"]
self.dynamic_axes.update({"discrete_actions": {0: "batch"}})
if (
self.policy.behavior_spec.action_spec.continuous_size == 0
or self.policy.behavior_spec.action_spec.discrete_size == 0
):
self.output_names += [
"action",
"is_continuous_control",
"action_output_shape",
]
self.dynamic_axes.update({"action": {0: "batch"}})
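The export-name bookkeeping above can be summarised as a small helper; this is a sketch with continuous_size/discrete_size standing in for the real ActionSpec fields, not the actual ModelSerializer code:

def build_output_names(continuous_size: int, discrete_size: int):
    output_names = ["version_number", "memory_size"]
    dynamic_axes = {}
    if continuous_size > 0:
        output_names += ["continuous_actions", "continuous_action_output_shape"]
        dynamic_axes["continuous_actions"] = {0: "batch"}
    if discrete_size > 0:
        output_names += ["discrete_actions", "discrete_action_output_shape"]
        dynamic_axes["discrete_actions"] = {0: "batch"}
    # The deprecated single "action" output is only emitted for non-hybrid specs.
    if continuous_size == 0 or discrete_size == 0:
        output_names += ["action", "is_continuous_control", "action_output_shape"]
        dynamic_axes["action"] = {0: "batch"}
    return output_names, dynamic_axes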
def export_policy_model(self, output_filepath: str) -> None:
"""

226
ml-agents/mlagents/trainers/torch/networks.py


from typing import Callable, List, Dict, Tuple, Optional
from typing import Callable, List, Dict, Tuple, Optional, Union
from mlagents.trainers.torch.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
DistInstance,
)
from mlagents.trainers.torch.action_model import ActionModel
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.decoders import ValueHeads

else 0
)
self.visual_processors, self.vector_processors, encoder_input_size = ModelUtils.create_input_processors(
(
self.visual_processors,
self.vector_processors,
encoder_input_size,
) = ModelUtils.create_input_processors(
observation_shapes,
self.h_size,
network_settings.vis_encode_type,

pass
@abc.abstractmethod
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
"""
Takes a List of Distribution instances and samples an action from each.
"""
pass
@abc.abstractmethod
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Optional[torch.Tensor]]:
"""
Returns distributions from this Actor, from which actions can be sampled.
If memory is enabled, return the memories as well.
:param vec_inputs: A List of vector inputs as tensors.
:param vis_inputs: A List of visual inputs as tensors.
:param masks: If using discrete actions, a Tensor of action masks.
:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.
:return: A Tuple of a List of action distribution instances, and memories.
Memories will be None if not using memory.
"""
pass
@abc.abstractmethod
def forward(
self,
vec_inputs: List[torch.Tensor],

) -> Tuple[torch.Tensor, int, int, int, int]:
) -> Tuple[Union[int, torch.Tensor], ...]:
"""
Forward pass of the Actor for inference. This is required for export to ONNX, and
the inputs and outputs of this method should not be changed without a respective change

pass
@abc.abstractmethod
def get_dist_and_value(
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],

) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
"""
Returns distributions, from which actions can be sampled, and value estimates.
If memory is enabled, return the memories as well.

:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.
:return: A Tuple of a List of action distribution instances, a Dict of reward signal
:return: A Tuple of AgentAction, ActionLogProbs, entropies, Dict of reward signal
name to value estimate, and memories. Memories will be None if not using memory.
"""
pass

super().__init__()
self.action_spec = action_spec
self.version_number = torch.nn.Parameter(torch.Tensor([2.0]))
self.is_continuous_int = torch.nn.Parameter(
self.is_continuous_int_deprecated = torch.nn.Parameter(
self.act_size_vector = torch.nn.Parameter(
self.continuous_act_size_vector = torch.nn.Parameter(
torch.Tensor([int(self.action_spec.continuous_size)]), requires_grad=False
)
# TODO: export list of branch sizes instead of sum
self.discrete_act_size_vector = torch.nn.Parameter(
torch.Tensor([sum(self.action_spec.discrete_branches)]), requires_grad=False
)
self.act_size_vector_deprecated = torch.nn.Parameter(
torch.Tensor(
[
self.action_spec.continuous_size

else:
self.encoding_size = network_settings.hidden_units
if self.action_spec.is_continuous():
self.distribution = GaussianDistribution(
self.encoding_size,
self.action_spec.continuous_size,
conditional_sigma=conditional_sigma,
tanh_squash=tanh_squash,
)
else:
self.distribution = MultiCategoricalDistribution(
self.encoding_size, self.action_spec.discrete_branches
)
# During training, clipping is done in TorchPolicy, but we need to clip before ONNX
# export as well.
self._clip_action_on_export = not tanh_squash
self.action_model = ActionModel(
self.encoding_size,
action_spec,
conditional_sigma=conditional_sigma,
tanh_squash=tanh_squash,
)
@property
def memory_size(self) -> int:

self.network_body.update_normalization(vector_obs)
def sample_action(self, dists: List[DistInstance]) -> List[torch.Tensor]:
actions = []
for action_dist in dists:
action = action_dist.sample()
actions.append(action)
return actions
def get_dists(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[List[DistInstance], Optional[torch.Tensor]]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
if self.action_spec.is_continuous():
dists = self.distribution(encoding)
else:
dists = self.distribution(encoding, masks)
return dists, memories
def forward(
self,
vec_inputs: List[torch.Tensor],

) -> Tuple[torch.Tensor, int, int, int, int]:
) -> Tuple[Union[int, torch.Tensor], ...]:
At this moment, torch.onnx.export() does not accept None as a tensor to be exported,
so the size of the return tuple varies with the action spec.
dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
if self.action_spec.is_continuous():
action_list = self.sample_action(dists)
action_out = torch.stack(action_list, dim=-1)
if self._clip_action_on_export:
action_out = torch.clamp(action_out, -3, 3) / 3
else:
action_out = torch.cat([dist.all_log_prob() for dist in dists], dim=1)
return (
action_out,
encoding, memories_out = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=1
)
(
cont_action_out,
disc_action_out,
action_out_deprecated,
) = self.action_model.get_action_out(encoding, masks)
export_out = [
self.is_continuous_int,
self.act_size_vector,
)
]
if self.action_spec.continuous_size > 0:
export_out += [cont_action_out, self.continuous_act_size_vector]
if self.action_spec.discrete_size > 0:
export_out += [disc_action_out, self.discrete_act_size_vector]
# Only export deprecated nodes with non-hybrid action spec
if self.action_spec.continuous_size == 0 or self.action_spec.discrete_size == 0:
export_out += [
action_out_deprecated,
self.is_continuous_int_deprecated,
self.act_size_vector_deprecated,
]
return tuple(export_out)
class SharedActorCritic(SimpleActor, ActorCritic):

conditional_sigma: bool = False,
tanh_squash: bool = False,
):
self.use_lstm = network_settings.memory is not None
super().__init__(
observation_shapes,
network_settings,

)
return self.value_heads(encoding), memories_out
def get_dist_and_value(
def get_stats_and_value(
actions: AgentAction,
) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
if self.action_spec.is_continuous():
dists = self.distribution(encoding)
else:
dists = self.distribution(encoding, masks=masks)
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs = self.value_heads(encoding)
return log_probs, entropies, value_outputs
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
encoding, memories = self.network_body(
vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
return dists, value_outputs, memories
return action, log_probs, entropies, value_outputs, memories
class SeparateActorCritic(SimpleActor, ActorCritic):

conditional_sigma: bool = False,
tanh_squash: bool = False,
):
# Give the Actor only half the memories. Note we previously validate
# that memory_size must be a multiple of 4.
self.use_lstm = network_settings.memory is not None
super().__init__(
observation_shapes,

memories_out = None
return value_outputs, memories_out
def get_dist_and_value(
def get_stats_and_value(
actions: AgentAction,
) -> Tuple[List[DistInstance], Dict[str, torch.Tensor], torch.Tensor]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)

dists, actor_mem_outs = self.get_dists(
vec_inputs,
vis_inputs,
memories=actor_mem,
sequence_length=sequence_length,
masks=masks,
encoding, actor_mem_outs = self.network_body(
vec_inputs, vis_inputs, memories=actor_mem, sequence_length=sequence_length
log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
value_outputs, critic_mem_outs = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)
return log_probs, entropies, value_outputs
def get_action_stats_and_value(
self,
vec_inputs: List[torch.Tensor],
vis_inputs: List[torch.Tensor],
masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
sequence_length: int = 1,
) -> Tuple[
AgentAction, ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
]:
if self.use_lstm:
# Use only the back half of memories for critic and actor
actor_mem, critic_mem = torch.split(memories, self.memory_size // 2, dim=-1)
else:
critic_mem = None
actor_mem = None
encoding, actor_mem_outs = self.network_body(
vec_inputs, vis_inputs, memories=actor_mem, sequence_length=sequence_length
)
action, log_probs, entropies = self.action_model(encoding, masks)
value_outputs, critic_mem_outs = self.critic(
vec_inputs, vis_inputs, memories=critic_mem, sequence_length=sequence_length
)

mem_out = None
return dists, value_outputs, mem_out
return action, log_probs, entropies, value_outputs, mem_out
def update_normalization(self, vector_obs: List[torch.Tensor]) -> None:
super().update_normalization(vector_obs)
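To make the new control flow concrete, here is a self-contained sketch (not the real ActionModel) of a hybrid action head: a shared encoding feeds one Gaussian head for the continuous part and one categorical head per discrete branch, which is what allows get_action_stats_and_value to return a single AgentAction covering both spaces:

import torch
from torch import nn

class HybridActionHead(nn.Module):
    def __init__(self, encoding_size: int, continuous_size: int, discrete_branches):
        super().__init__()
        self.mu = nn.Linear(encoding_size, continuous_size) if continuous_size > 0 else None
        self.log_sigma = (
            nn.Parameter(torch.zeros(1, continuous_size)) if continuous_size > 0 else None
        )
        self.branches = nn.ModuleList(nn.Linear(encoding_size, n) for n in discrete_branches)

    def forward(self, encoding):
        continuous, discrete = None, None
        if self.mu is not None:
            dist = torch.distributions.Normal(self.mu(encoding), torch.exp(self.log_sigma))
            continuous = dist.sample()
        if len(self.branches) > 0:
            # One categorical sample per branch, stacked into [batch, num_branches].
            discrete = torch.stack(
                [torch.distributions.Categorical(logits=b(encoding)).sample() for b in self.branches],
                dim=-1,
            )
        return continuous, discrete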

48
ml-agents/mlagents/trainers/torch/utils.py


)
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.exception import UnityTrainerException
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance
class ModelUtils:

EncoderType.NATURE_CNN: 36,
EncoderType.RESNET: 15,
}
class ActionFlattener:
def __init__(self, action_spec: ActionSpec):
self._specs = action_spec
@property
def flattened_size(self) -> int:
if self._specs.is_continuous():
return self._specs.continuous_size
else:
return sum(self._specs.discrete_branches)
def forward(self, action: torch.Tensor) -> torch.Tensor:
if self._specs.is_continuous():
return action
else:
return torch.cat(
ModelUtils.actions_to_onehot(
torch.as_tensor(action, dtype=torch.long),
self._specs.discrete_branches,
),
dim=1,
)
@staticmethod
def update_learning_rate(optim: torch.optim.Optimizer, lr: float) -> None:

for i in range(num_partitions):
res += [data[(partitions == i).nonzero().squeeze(1)]]
return res
@staticmethod
def get_probs_and_entropy(
action_list: List[torch.Tensor], dists: List[DistInstance]
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
log_probs_list = []
all_probs_list = []
entropies_list = []
for action, action_dist in zip(action_list, dists):
log_prob = action_dist.log_prob(action)
log_probs_list.append(log_prob)
entropies_list.append(action_dist.entropy())
if isinstance(action_dist, DiscreteDistInstance):
all_probs_list.append(action_dist.all_log_prob())
log_probs = torch.stack(log_probs_list, dim=-1)
entropies = torch.stack(entropies_list, dim=-1)
if not all_probs_list:
log_probs = log_probs.squeeze(-1)
entropies = entropies.squeeze(-1)
all_probs = None
else:
all_probs = torch.cat(all_probs_list, dim=-1)
return log_probs, entropies, all_probs
@staticmethod
def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
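The ActionFlattener shown here is the old single-space version being removed from ModelUtils; its replacement in mlagents.trainers.torch.action_flattener handles both parts of a hybrid action at once. A minimal sketch of that behaviour, with plain tensors standing in for AgentAction:

import torch
import torch.nn.functional as F

def flatten_hybrid_action(continuous, discrete, branch_sizes):
    # Continuous actions pass through unchanged; each discrete branch becomes a
    # one-hot vector; everything is concatenated into one flat feature vector,
    # so flattened_size == continuous_size + sum(branch_sizes).
    parts = []
    if continuous is not None:
        parts.append(continuous)  # [batch, continuous_size]
    if discrete is not None:
        for i, size in enumerate(branch_sizes):
            parts.append(F.one_hot(discrete[:, i].long(), num_classes=size).float())
    return torch.cat(parts, dim=1)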

28
ml-agents/mlagents/trainers/trajectory.py


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.base_env import ActionTuple
from mlagents.trainers.torch.action_log_probs import LogProbsTuple
class AgentExperience(NamedTuple):

action: np.ndarray
action_probs: np.ndarray
action_pre: np.ndarray # TODO: Remove this
action: ActionTuple
action_probs: LogProbsTuple
action_mask: np.ndarray
prev_action: np.ndarray
interrupted: bool

agent_buffer_trajectory["masks"].append(1.0)
agent_buffer_trajectory["done"].append(exp.done)
# Add the outputs of the last eval
if exp.action_pre is not None:
actions_pre = exp.action_pre
agent_buffer_trajectory["actions_pre"].append(actions_pre)
# value is a dictionary from name of reward to value estimate of the value head
agent_buffer_trajectory["actions"].append(exp.action)
agent_buffer_trajectory["action_probs"].append(exp.action_probs)
# Adds the log prob and action of continuous/discrete separately
agent_buffer_trajectory["continuous_action"].append(exp.action.continuous)
agent_buffer_trajectory["discrete_action"].append(exp.action.discrete)
agent_buffer_trajectory["continuous_log_probs"].append(
exp.action_probs.continuous
)
agent_buffer_trajectory["discrete_log_probs"].append(
exp.action_probs.discrete
)
# Store action masks if necessary. Note that 1 means active, while
# in AgentExperience False means active.

else:
# This should never be needed unless the environment somehow doesn't supply the
# action mask in a discrete space.
action_shape = exp.action.discrete.shape
np.ones(exp.action_probs.shape, dtype=np.float32), padding_value=1
np.ones(action_shape, dtype=np.float32), padding_value=1
agent_buffer_trajectory["prev_action"].append(exp.prev_action)
agent_buffer_trajectory["environment_rewards"].append(exp.reward)
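An illustrative helper (not the real AgentAction.from_dict) showing how the separate buffer keys written above could be read back for training; the key names match the diff, but the function itself is hypothetical:

import numpy as np
import torch

def actions_from_buffer(mini_batch):
    continuous = discrete = None
    if "continuous_action" in mini_batch:
        continuous = torch.as_tensor(np.asarray(mini_batch["continuous_action"]), dtype=torch.float32)
    if "discrete_action" in mini_batch:
        discrete = torch.as_tensor(np.asarray(mini_batch["discrete_action"]), dtype=torch.long)
    return continuous, discrete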

22
ml-agents/tests/yamato/scripts/run_llapi.py


import argparse
import numpy as np
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import (

episode_rewards = 0
tracked_agent = -1
while not done:
if group_spec.action_spec.is_continuous():
action = np.random.randn(
len(decision_steps), group_spec.action_spec.continuous_size
)
elif group_spec.action_spec.is_discrete():
branch_size = group_spec.action_spec.discrete_branches
action = np.column_stack(
[
np.random.randint(
0, branch_size[i], size=(len(decision_steps))
)
for i in range(len(branch_size))
]
)
else:
# Should never happen
action = None
action_tuple = group_spec.action_spec.random_action(len(decision_steps))
env.set_actions(group_name, action)
env.set_actions(group_name, action_tuple)
env.step()
decision_steps, terminal_steps = env.get_steps(group_name)
done = False
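A minimal random-action loop using the new hybrid-aware low-level API, assuming an environment build or the Editor is ready to connect; ActionSpec.random_action returns an ActionTuple that already carries both continuous and discrete parts, so no per-space branching is needed:

from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment()  # waits for the Editor / a build to connect
env.reset()
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]
for _ in range(10):
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    action_tuple = spec.action_spec.random_action(len(decision_steps))
    env.set_actions(behavior_name, action_tuple)
    env.step()
env.close()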

4
protobuf-definitions/proto/mlagents_envs/communicator_objects/agent_action.proto


package communicator_objects;
message AgentActionProto {
repeated float vector_actions = 1;
repeated float vector_actions_deprecated = 1; // mark as deprecated in communicator v1.3.0
repeated float continuous_actions = 6;
repeated int32 discrete_actions = 7;
}
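A quick illustration, via the generated Python bindings, of how an agent action is now encoded on the wire: the two new repeated fields carry the hybrid action, while the old field survives only as vector_actions_deprecated for older trainers (values below are arbitrary):

from mlagents_envs.communicator_objects.agent_action_pb2 import AgentActionProto

proto = AgentActionProto(
    continuous_actions=[0.25, -0.7],  # one float per continuous dimension
    discrete_actions=[1, 0],          # one int per discrete branch
)
print(proto)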

14
protobuf-definitions/proto/mlagents_envs/communicator_objects/brain_parameters.proto


option csharp_namespace = "Unity.MLAgents.CommunicatorObjects";
package communicator_objects;
message ActionSpecProto {
int32 num_continuous_actions = 1;
int32 num_discrete_actions = 2;
repeated int32 discrete_branch_sizes = 3;
repeated string action_descriptions = 4;
}
repeated int32 vector_action_size = 3;
repeated int32 vector_action_size_deprecated = 3; // mark as deprecated in communicator v1.3.0
repeated string vector_action_descriptions = 5;
SpaceTypeProto vector_action_space_type = 6;
repeated string vector_action_descriptions_deprecated = 5; // mark as deprecated in communicator v1.3.0
SpaceTypeProto vector_action_space_type_deprecated = 6; // mark as deprecated in communicator v1.3.0
ActionSpecProto action_spec = 9;
}
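And the matching sketch for the new ActionSpecProto, here describing a hybrid space with two continuous dimensions and two discrete branches of sizes 3 and 2 (roughly what ActionSpec(continuous_size=2, discrete_branches=(3, 2)) expresses on the Python side):

from mlagents_envs.communicator_objects.brain_parameters_pb2 import ActionSpecProto

spec_proto = ActionSpecProto(
    num_continuous_actions=2,
    num_discrete_actions=2,
    discrete_branch_sizes=[3, 2],
)
print(spec_proto)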

3
protobuf-definitions/proto/mlagents_envs/communicator_objects/capabilities.proto


// compression mapping for stacking compressed observations.
bool compressedChannelMapping = 3;
// support for hybrid action spaces (discrete + continuous)
bool hybridActions = 4;
}

Some files were not shown because too many files changed in this diff.
