
Merge remote-tracking branch 'origin/master' into develop-BehaviorParams-public

/bug-failed-api-check
Chris Elion, 5 years ago
Current commit: fa5e7e6d
39 files changed, with 526 insertions and 1374 deletions
  1. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (8 lines changed)
  2. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs (8 lines changed)
  3. Project/Assets/ML-Agents/Examples/Basic/Scripts/BasicController.cs (4 lines changed)
  4. Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs (12 lines changed)
  5. Project/Assets/ML-Agents/Examples/Crawler/Scripts/CrawlerAgent.cs (6 lines changed)
  6. Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs (7 lines changed)
  7. Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo (1001 lines changed)
  8. Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs (12 lines changed)
  9. Project/Assets/ML-Agents/Examples/Hallway/Scripts/HallwayAgent.cs (9 lines changed)
  10. Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs (9 lines changed)
  11. Project/Assets/ML-Agents/Examples/Pyramids/Scripts/PyramidAgent.cs (9 lines changed)
  12. Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (6 lines changed)
  13. Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/GroundContact.cs (2 lines changed)
  14. Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs (2 lines changed)
  15. Project/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs (7 lines changed)
  16. Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs (2 lines changed)
  17. Project/Assets/ML-Agents/Examples/Template/Scripts/TemplateAgent.cs (4 lines changed)
  18. Project/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs (4 lines changed)
  19. Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs (6 lines changed)
  20. Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs (6 lines changed)
  21. Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs (16 lines changed)
  22. com.unity.ml-agents/CHANGELOG.md (9 lines changed)
  23. com.unity.ml-agents/Runtime/Agent.cs (56 lines changed)
  24. com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs (10 lines changed)
  25. docs/Getting-Started-with-Balance-Ball.md (21 lines changed)
  26. docs/Learning-Environment-Create-New.md (34 lines changed)
  27. docs/Learning-Environment-Design-Agents.md (38 lines changed)
  28. docs/Learning-Environment-Design.md (33 lines changed)
  29. docs/Migrating.md (14 lines changed)
  30. docs/Python-API.md (236 lines changed)
  31. docs/Readme.md (1 line changed)
  32. docs/Training-Curriculum-Learning.md (2 lines changed)
  33. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (5 lines changed)
  34. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (5 lines changed)
  35. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (3 lines changed)
  36. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (16 lines changed)
  37. ml-agents/mlagents/trainers/sac/optimizer.py (4 lines changed)
  38. ml-agents/mlagents/trainers/tests/test_ppo.py (66 lines changed)
  39. docs/Custom-SideChannels.md (207 lines changed)

8
Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs


Rigidbody m_BallRb;
FloatPropertiesChannel m_ResetParams;
public override void InitializeAgent()
public override void Initialize()
{
m_BallRb = ball.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;

sensor.AddObservation(m_BallRb.velocity);
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);
var actionX = 2f * Mathf.Clamp(vectorAction[1], -1f, 1f);

Mathf.Abs(ball.transform.position.z - gameObject.transform.position.z) > 3f)
{
SetReward(-1f);
Done();
EndEpisode();
}
else
{

public override void AgentReset()
public override void OnEpisodeBegin()
{
gameObject.transform.rotation = new Quaternion(0f, 0f, 0f, 0f);
gameObject.transform.Rotate(new Vector3(1, 0, 0), Random.Range(-10f, 10f));

8
Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs


Rigidbody m_BallRb;
FloatPropertiesChannel m_ResetParams;
public override void InitializeAgent()
public override void Initialize()
{
m_BallRb = ball.GetComponent<Rigidbody>();
m_ResetParams = Academy.Instance.FloatProperties;

sensor.AddObservation((ball.transform.position - gameObject.transform.position));
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);
var actionX = 2f * Mathf.Clamp(vectorAction[1], -1f, 1f);

Mathf.Abs(ball.transform.position.z - gameObject.transform.position.z) > 3f)
{
SetReward(-1f);
Done();
EndEpisode();
}
else
{

public override void AgentReset()
public override void OnEpisodeBegin()
{
gameObject.transform.rotation = new Quaternion(0f, 0f, 0f, 0f);
gameObject.transform.Rotate(new Vector3(1, 0, 0), Random.Range(-10f, 10f));

4
Project/Assets/ML-Agents/Examples/Basic/Scripts/BasicController.cs


if (m_Position == k_SmallGoalPosition)
{
m_Agent.AddReward(0.1f);
m_Agent.Done();
m_Agent.EndEpisode();
ResetAgent();
}

m_Agent.Done();
m_Agent.EndEpisode();
ResetAgent();
}
}

12
Project/Assets/ML-Agents/Examples/Bouncer/Scripts/BouncerAgent.cs


FloatPropertiesChannel m_ResetParams;
public override void InitializeAgent()
public override void Initialize()
{
m_Rb = gameObject.GetComponent<Rigidbody>();
m_LookDir = Vector3.zero;

sensor.AddObservation(target.transform.localPosition);
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
for (var i = 0; i < vectorAction.Length; i++)
{

m_LookDir = new Vector3(x, y, z);
}
public override void AgentReset()
public override void OnEpisodeBegin()
{
gameObject.transform.localPosition = new Vector3(
(1 - 2 * Random.value) * 5, 2, (1 - 2 * Random.value) * 5);

if (gameObject.transform.position.y < -1)
{
AddReward(-1);
Done();
EndEpisode();
return;
}

AddReward(-1);
Done();
EndEpisode();
Done();
EndEpisode();
}
}

6
Project/Assets/ML-Agents/Examples/Crawler/Scripts/CrawlerAgent.cs


Quaternion m_LookRotation;
Matrix4x4 m_TargetDirMatrix;
public override void InitializeAgent()
public override void Initialize()
{
m_JdController = GetComponent<JointDriveController>();
m_DirToTarget = target.position - body.position;

target.position = newTargetPos + ground.position;
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
// The dictionary with all the body parts in it are in the jdController
var bpDict = m_JdController.bodyPartsDict;

/// <summary>
/// Loop over body parts and reset them to initial conditions.
/// </summary>
public override void AgentReset()
public override void OnEpisodeBegin()
{
if (m_DirToTarget != Vector3.zero)
{

7
Project/Assets/ML-Agents/Examples/FoodCollector/Scripts/FoodCollectorAgent.cs


public bool useVectorObs;
public override void InitializeAgent()
public override void Initialize()
base.InitializeAgent();
m_AgentRb = GetComponent<Rigidbody>();
m_MyArea = area.GetComponent<FoodCollectorArea>();
m_FoodCollecterSettings = FindObjectOfType<FoodCollectorSettings>();

gameObject.GetComponentInChildren<Renderer>().material = normalMaterial;
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
MoveAgent(vectorAction);
}

return action;
}
public override void AgentReset()
public override void OnEpisodeBegin()
{
Unfreeze();
Unpoison();

1001
Project/Assets/ML-Agents/Examples/GridWorld/Demos/ExpertGrid.demo
The file diff is too large to display.

12
Project/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs


const int k_Left = 3;
const int k_Right = 4;
public override void InitializeAgent()
{
}
public override void CollectDiscreteActionMasks(DiscreteActionMasker actionMasker)
{
// Mask the necessary actions if selected by the user.

}
// to be implemented by the developer
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
AddReward(-0.01f);
var action = Mathf.FloorToInt(vectorAction[0]);

if (hit.Where(col => col.gameObject.CompareTag("goal")).ToArray().Length == 1)
{
SetReward(1f);
Done();
EndEpisode();
Done();
EndEpisode();
}
}
}

}
// to be implemented by the developer
public override void AgentReset()
public override void OnEpisodeBegin()
{
area.AreaReset();
}

9
Project/Assets/ML-Agents/Examples/Hallway/Scripts/HallwayAgent.cs


HallwaySettings m_HallwaySettings;
int m_Selection;
public override void InitializeAgent()
public override void Initialize()
base.InitializeAgent();
m_HallwaySettings = FindObjectOfType<HallwaySettings>();
m_AgentRb = GetComponent<Rigidbody>();
m_GroundRenderer = ground.GetComponent<Renderer>();

m_AgentRb.AddForce(dirToGo * m_HallwaySettings.agentRunSpeed, ForceMode.VelocityChange);
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
AddReward(-1f / maxStep);
MoveAgent(vectorAction);

SetReward(-0.1f);
StartCoroutine(GoalScoredSwapGroundMaterial(m_HallwaySettings.failMaterial, 0.5f));
}
Done();
EndEpisode();
}
}

return new float[] { 0 };
}
public override void AgentReset()
public override void OnEpisodeBegin()
{
var agentOffset = -15f;
var blockOffset = 0f;

9
Project/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs


m_PushBlockSettings = FindObjectOfType<PushBlockSettings>();
}
public override void InitializeAgent()
public override void Initialize()
base.InitializeAgent();
goalDetect = block.GetComponent<GoalDetect>();
goalDetect.agent = this;

AddReward(5f);
// By marking an agent as done AgentReset() will be called automatically.
Done();
EndEpisode();
// Swap ground material for a bit to indicate we scored.
StartCoroutine(GoalScoredSwapGroundMaterial(m_PushBlockSettings.goalScoredMaterial, 0.5f));

/// <summary>
/// Called every step of the engine. Here the agent takes an action.
/// </summary>
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
// Move the agent using the action.
MoveAgent(vectorAction);

/// In the editor, if "Reset On Done" is checked then AgentReset() will be
/// called automatically anytime we mark done = true in an agent script.
/// </summary>
public override void AgentReset()
public override void OnEpisodeBegin()
{
var rotation = Random.Range(0, 4);
var rotationAngle = rotation * 90f;

9
Project/Assets/ML-Agents/Examples/Pyramids/Scripts/PyramidAgent.cs


public GameObject areaSwitch;
public bool useVectorObs;
public override void InitializeAgent()
public override void Initialize()
base.InitializeAgent();
m_AgentRb = GetComponent<Rigidbody>();
m_MyArea = area.GetComponent<PyramidArea>();
m_SwitchLogic = areaSwitch.GetComponent<PyramidSwitch>();

m_AgentRb.AddForce(dirToGo * 2f, ForceMode.VelocityChange);
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
AddReward(-1f / maxStep);
MoveAgent(vectorAction);

return new float[] { 0 };
}
public override void AgentReset()
public override void OnEpisodeBegin()
{
var enumerable = Enumerable.Range(0, 9).OrderBy(x => Guid.NewGuid()).Take(9);
var items = enumerable.ToArray();

if (collision.gameObject.CompareTag("goal"))
{
SetReward(2f);
Done();
EndEpisode();
}
}
}

6
Project/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs


/// Collect the rigidbodies of the reacher in order to reuse them for
/// observations and actions.
/// </summary>
public override void InitializeAgent()
public override void Initialize()
{
m_RbA = pendulumA.GetComponent<Rigidbody>();
m_RbB = pendulumB.GetComponent<Rigidbody>();

/// <summary>
/// The agent's four actions correspond to torques on each of the two joints.
/// </summary>
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
m_GoalDegree += m_GoalSpeed;
UpdateGoalPosition();

/// <summary>
/// Resets the position and velocity of the agent and the goal.
/// </summary>
public override void AgentReset()
public override void OnEpisodeBegin()
{
pendulumA.transform.position = new Vector3(0f, -4f, 0f) + transform.position;
pendulumA.transform.rotation = Quaternion.Euler(180f, 0f, 0f);

2
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/GroundContact.cs


if (agentDoneOnGroundContact)
{
agent.Done();
agent.EndEpisode();
}
}
}

2
Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/ModelOverrider.cs


var nnModel = GetModelForBehaviorName(name);
Debug.Log($"Overriding behavior {name} for agent with model {nnModel?.name}");
// This might give a null model; that's better because we'll fall back to the Heuristic
m_Agent.GiveModel($"Override_{name}", nnModel);
m_Agent.SetModel($"Override_{name}", nnModel);
}
}

7
Project/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs


BehaviorParameters m_BehaviorParameters;
Vector3 m_Transform;
public override void InitializeAgent()
public override void Initialize()
base.InitializeAgent();
m_BehaviorParameters = gameObject.GetComponent<BehaviorParameters>();
if (m_BehaviorParameters.TeamId == (int)Team.Blue)
{

ForceMode.VelocityChange);
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
// Existential penalty for strikers.
AddReward(-1f / 3000f);

}
}
public override void AgentReset()
public override void OnEpisodeBegin()
{
if (team == Team.Purple)
{

2
Project/Assets/ML-Agents/Examples/Soccer/Scripts/SoccerFieldArea.cs


{
ps.agentScript.AddReward(-1);
}
ps.agentScript.Done(); //all agents need to be reset
ps.agentScript.EndEpisode(); //all agents need to be reset
if (goalTextUI)
{

4
Project/Assets/ML-Agents/Examples/Template/Scripts/TemplateAgent.cs


{
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
public override void AgentReset()
public override void OnEpisodeBegin()
{
}
}

4
Project/Assets/ML-Agents/Examples/Tennis/Scripts/HitWall.cs


void Reset()
{
m_AgentA.Done();
m_AgentB.Done();
m_AgentA.EndEpisode();
m_AgentB.EndEpisode();
m_Area.MatchReset();
lastFloorHit = FloorHit.Service;
net = false;

6
Project/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs


const string k_ScoreBoardAName = "ScoreA";
const string k_ScoreBoardBName = "ScoreB";
public override void InitializeAgent()
public override void Initialize()
{
m_AgentRb = GetComponent<Rigidbody>();
m_BallRb = ball.GetComponent<Rigidbody>();

sensor.AddObservation(m_InvertMult * gameObject.transform.rotation.z);
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
var moveX = Mathf.Clamp(vectorAction[0], -1f, 1f) * m_InvertMult;
var moveY = Mathf.Clamp(vectorAction[1], -1f, 1f);

return action;
}
public override void AgentReset()
public override void OnEpisodeBegin()
{
m_InvertMult = invertX ? -1f : 1f;

6
Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs


FloatPropertiesChannel m_ResetParams;
public override void InitializeAgent()
public override void Initialize()
{
m_JdController = GetComponent<JointDriveController>();
m_JdController.SetupBodyPart(hips);

}
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
var bpDict = m_JdController.bodyPartsDict;
var i = -1;

/// <summary>
/// Loop over body parts and reset them to initial conditions.
/// </summary>
public override void AgentReset()
public override void OnEpisodeBegin()
{
if (m_DirToTarget != Vector3.zero)
{

16
Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs


Vector3 m_JumpTargetPos;
Vector3 m_JumpStartingPos;
public override void InitializeAgent()
public override void Initialize()
{
m_WallJumpSettings = FindObjectOfType<WallJumpSettings>();
m_Configuration = Random.Range(0, 5);

jumpingTime -= Time.fixedDeltaTime;
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
MoveAgent(vectorAction);
if ((!Physics.Raycast(m_AgentRb.position, Vector3.down, 20))

Done();
EndEpisode();
ResetBlock(m_ShortBlockRb);
StartCoroutine(
GoalScoredSwapGroundMaterial(m_WallJumpSettings.failMaterial, .5f));

if (col.gameObject.CompareTag("goal") && DoGroundCheck(true))
{
SetReward(1f);
Done();
EndEpisode();
StartCoroutine(
GoalScoredSwapGroundMaterial(m_WallJumpSettings.goalScoredMaterial, 2));
}

blockRb.angularVelocity = Vector3.zero;
}
public override void AgentReset()
public override void OnEpisodeBegin()
{
ResetBlock(m_ShortBlockRb);
transform.localPosition = new Vector3(

Academy.Instance.FloatProperties.GetPropertyWithDefault("no_wall_height", 0),
localScale.z);
wall.transform.localScale = localScale;
GiveModel("SmallWallJump", noWallBrain);
SetModel("SmallWallJump", noWallBrain);
}
else if (config == 1)
{

localScale.z);
wall.transform.localScale = localScale;
GiveModel("SmallWallJump", smallWallBrain);
SetModel("SmallWallJump", smallWallBrain);
}
else
{

height,
localScale.z);
wall.transform.localScale = localScale;
GiveModel("BigWallJump", bigWallBrain);
SetModel("BigWallJump", bigWallBrain);
}
}
}

9
com.unity.ml-agents/CHANGELOG.md


- Multi-GPU training and the `--multi-gpu` option has been removed temporarily. (#3345)
- All Sensor related code has been moved to the namespace `MLAgents.Sensors`.
- All SideChannel related code has been moved to the namespace `MLAgents.SideChannels`.
- `BrainParameters` and `SpaceType` have been removed from the public API
- `BehaviorParameters` have been removed from the public API.
- The following methods in the `Agent` class have been deprecated and will be removed in a later release:
- `InitializeAgent()` was renamed to `Initialize()`
- `AgentAction()` was renamed to `OnActionReceived()`
- `AgentReset()` was renamed to `OnEpisodeBegin()`
- `Done()` was renamed to `EndEpisode()`
- `GiveModel()` was renamed to `SetModel()`
### Minor Changes
- Monitor.cs was moved to Examples. (#3372)

- `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity. (#3555)
- The `IFloatProperties` interface has been removed.
- Fix #3579.
- Fixed an issue when using GAIL with less than `batch_size` number of demonstrations. (#3591)
## [0.14.1-preview] - 2020-02-25

56
com.unity.ml-agents/Runtime/Agent.cs


Academy.Instance.AgentForceReset += _AgentReset;
m_Brain = m_PolicyFactory.GeneratePolicy(Heuristic);
ResetData();
InitializeAgent();
Initialize();
InitializeSensors();
}

m_RequestDecision = false;
}
[Obsolete("GiveModel() has been deprecated, use SetModel() instead.")]
public void GiveModel(
string behaviorName,
NNModel model,
InferenceDevice inferenceDevice = InferenceDevice.CPU)
{
SetModel(behaviorName, model, inferenceDevice);
}
/// <summary>
/// Updates the Model for the agent. Any model currently assigned to the
/// agent will be replaced with the provided one. If the arguments are

/// <param name="model"> The model to use for inference.</param>
/// <param name = "inferenceDevice"> Define on what device the model
/// will be run.</param>
public void GiveModel(
public void SetModel(
string behaviorName,
NNModel model,
InferenceDevice inferenceDevice = InferenceDevice.CPU)

TimerStack.Instance.SetGauge(gaugeName, GetCumulativeReward());
}
[Obsolete("Done() has been deprecated, use EndEpisode() instead.")]
public void Done()
{
EndEpisode();
}
public void Done()
public void EndEpisode()
{
NotifyAgentDone(DoneReason.DoneCalled);
_AgentReset();

}
}
[Obsolete("InitializeAgent() has been deprecated, use Initialize() instead.")]
public virtual void InitializeAgent()
{
}
/// <summary>
/// Initializes the agent, called once when the agent is enabled. Can be
/// left empty if there is no special, unique set-up behavior for the

/// One sample use is to store local references to other objects in the
/// scene which would facilitate computing this agents observation.
/// </remarks>
public virtual void InitializeAgent()
public virtual void Initialize()
#pragma warning disable 0618
InitializeAgent();
#pragma warning restore 0618
}
/// <summary>

{
}
[Obsolete("AgentAction() has been deprecated, use OnActionReceived() instead.")]
public virtual void AgentAction(float[] vectorAction)
{
}
/// <summary>
/// Specifies the agent behavior at every step based on the provided
/// action.

/// will be of length 1.
/// </param>
public virtual void AgentAction(float[] vectorAction)
public virtual void OnActionReceived(float[] vectorAction)
{
#pragma warning disable 0618
AgentAction(m_Action.vectorActions);
#pragma warning restore 0618
}
[Obsolete("AgentReset() has been deprecated, use OnEpisodeBegin() instead.")]
public virtual void AgentReset()
{
}

/// episode).
/// </summary>
public virtual void AgentReset()
public virtual void OnEpisodeBegin()
#pragma warning disable 0618
AgentReset();
#pragma warning restore 0618
}
/// <summary>

{
ResetData();
m_StepCount = 0;
AgentReset();
OnEpisodeBegin();
}
/// <summary>

if ((m_RequestAction) && (m_Brain != null))
{
m_RequestAction = false;
AgentAction(m_Action.vectorActions);
OnActionReceived(m_Action.vectorActions);
}
if ((m_StepCount >= maxStep) && (maxStep > 0))

10
com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs


public TestSensor sensor1;
public TestSensor sensor2;
public override void InitializeAgent()
public override void Initialize()
{
initializeAgentCalls += 1;

sensor.AddObservation(0f);
}
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
agentActionCalls += 1;
agentActionCallsSinceLastReset += 1;

public override void AgentReset()
public override void OnEpisodeBegin()
{
agentResetCalls += 1;
collectObservationsCallsSinceLastReset = 0;

// Set agent 1 to done every 11 steps to test behavior
if (i % 11 == 5)
{
agent1.Done();
agent1.EndEpisode();
agent2.Done();
agent2.EndEpisode();
numberAgent2Reset += 1;
agent2StepSinceReset = 0;
}

21
docs/Getting-Started-with-Balance-Ball.md


* **Behavior Parameters** — Every Agent must have a Behavior. The Behavior
determines how an Agent makes decisions. More on Behavior Parameters in
the next section.
* **Max Step** — Defines how many simulation steps can occur before the Agent
decides it is done. In 3D Balance Ball, an Agent restarts after 5000 steps.
* **Max Step** — Defines how many simulation steps can occur before the Agent's
episode ends. In 3D Balance Ball, an Agent restarts after 5000 steps.
Perhaps the more interesting aspect of an agent is the Agent subclass
implementation. When you create an Agent, you must extend the base Agent class.
When you create an Agent, you must extend the base Agent class.
* agent.AgentReset() — Called when the Agent resets, including at the beginning
of a session. The Ball3DAgent class uses the reset function to reset the
* `Agent.OnEpisodeBegin()` — Called when the Agent resets, including at the beginning
of the simulation. The Ball3DAgent class uses the reset function to reset the
* agent.CollectObservations(VectorSensor sensor) — Called every simulation step. Responsible for
* `Agent.CollectObservations(VectorSensor sensor)` — Called every simulation step. Responsible for
* agent.AgentAction() — Called every simulation step. Receives the action chosen
* `Agent.OnActionReceived()` — Called every time the Agent receives an action to take. Receives the action chosen
small change in the agent cube's rotation at each step. The `AgentAction()` function
small change in the agent cube's rotation at each step. The `OnActionReceived()` method
negative reward for dropping the ball. An Agent is also marked as done when it
negative reward for dropping the ball. An Agent's episode is also ended when it
* agent.Heuristic() - When the `Behavior Type` is set to `Heuristic Only` in the Behavior
* `Agent.Heuristic()` - When the `Behavior Type` is set to `Heuristic Only` in the Behavior
Parameters of the Agent, the Agent will use the `Heuristic()` method to generate
the actions of the Agent. As such, the `Heuristic()` method returns an array of
floats. In the case of the Ball 3D Agent, the `Heuristic()` method converts the
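As a rough, hedged sketch of the pattern described above (not the exact Ball3DAgent implementation, which is truncated in this hunk), a heuristic that maps two input axes onto the two continuous actions could look like:

```csharp
// Illustrative only: the axis names and sign conventions are assumptions,
// not the values from the shipped Ball3DAgent script.
public override float[] Heuristic()
{
    var action = new float[2];
    action[0] = -Input.GetAxis("Horizontal"); // first continuous action
    action[1] = Input.GetAxis("Vertical");    // second continuous action
    return action;
}
```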

34
docs/Learning-Environment-Create-New.md


### Initialization and Resetting the Agent
When the Agent reaches its target, it marks itself done and its Agent reset
function moves the target to a random location. In addition, if the Agent rolls
off the platform, the reset function puts it back onto the floor.
When the Agent reaches its target, its episode ends and the `OnEpisodeBegin()`
method moves the target to a random location. In addition, if the Agent rolls
off the platform, the `OnEpisodeBegin()` method puts it back onto the floor.
To move the target GameObject, we need a reference to its Transform (which
stores a GameObject's position, orientation and scale in the 3D world). To get

}
public Transform Target;
public override void AgentReset()
public override void OnEpisodeBegin()
{
if (this.transform.position.y < 0)
{

}
```
The final part of the Agent code is the `Agent.AgentAction()` method, which
receives the decision from the Brain and assigns the reward.
The final part of the Agent code is the `Agent.OnActionReceived()` method, which
receives the actions from the Brain and assigns the reward.
`AgentAction()` function. The number of elements in this array is determined by
`OnActionReceived()` function. The number of elements in this array is determined by
the `Vector Action` `Space Type` and `Space Size` settings of the
agent's Brain. The RollerAgent uses the continuous vector action space and needs
two continuous control signals from the Brain. Thus, we will set the Brain

### Rewards
Reinforcement learning requires rewards. Assign rewards in the `AgentAction()`
Reinforcement learning requires rewards. Assign rewards in the `OnActionReceived()`
function. The learning algorithm uses the rewards assigned to the Agent during
the simulation and learning process to determine whether it is giving
the Agent the optimal actions. You want to reward an Agent for completing the

The RollerAgent calculates the distance to detect when it reaches the target.
When it does, the code calls the `Agent.SetReward()` method to assign a
reward of 1.0 and marks the agent as finished by calling the `Done()` method
reward of 1.0 and marks the agent as finished by calling the `EndEpisode()` method
on the Agent.
```csharp

if (distanceToTarget < 1.42f)
{
SetReward(1.0f);
Done();
EndEpisode();
Finally, if the Agent falls off the platform, set the Agent to done so that it can reset itself:
Finally, if the Agent falls off the platform, end the episode so that it can reset itself:
Done();
EndEpisode();
### AgentAction()
### OnActionReceived()
`AgentAction()` function looks like:
`OnActionReceived()` function looks like:
public override void AgentAction(float[] vectorAction)
public override void OnActionReceived(float[] vectorAction)
{
// Actions, size = 2
Vector3 controlSignal = Vector3.zero;

if (distanceToTarget < 1.42f)
{
SetReward(1.0f);
Done();
EndEpisode();
Done();
EndEpisode();
}
}
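Taken together, a minimal version of the RollerAgent's `OnActionReceived()` might look like the sketch below. The `Target` field appears earlier in this tutorial; the `rBody` and `speed` names are assumptions standing in for the tutorial's Rigidbody reference and force multiplier.

```csharp
// Hedged sketch assembling the tutorial's pieces; the exact field names
// (rBody, speed) are assumptions, not a verbatim copy of the doc's code.
public override void OnActionReceived(float[] vectorAction)
{
    // Actions, size = 2
    Vector3 controlSignal = Vector3.zero;
    controlSignal.x = vectorAction[0];
    controlSignal.z = vectorAction[1];
    rBody.AddForce(controlSignal * speed);

    // Reached target
    float distanceToTarget = Vector3.Distance(this.transform.position, Target.position);
    if (distanceToTarget < 1.42f)
    {
        SetReward(1.0f);
        EndEpisode();
    }

    // Fell off the platform
    if (this.transform.position.y < 0)
    {
        EndEpisode();
    }
}
```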

38
docs/Learning-Environment-Design-Agents.md


An action is an instruction from the Policy that the agent carries out. The
action is passed to the Agent as a parameter when the Academy invokes the
agent's `AgentAction()` function. When you specify that the vector action space
agent's `OnActionReceived()` function. When you specify that the vector action space
is **Continuous**, the action parameter passed to the Agent is an array of
control signals with length equal to the `Vector Action Space Size` property.
When you specify a **Discrete** vector action space type, the action parameter

values themselves mean. The training algorithm simply tries different values for
the action list and observes the effect on the accumulated rewards over time and
many training episodes. Thus, the only place actions are defined for an Agent is
in the `AgentAction()` function. You simply specify the type of vector action
space, and, for the continuous vector action space, the number of values, and
then apply the received values appropriately (and consistently) in
`ActionAct()`.
in the `OnActionReceived()` function.
For example, if you designed an agent to move in two dimensions, you could use
either continuous or the discrete vector actions. In the continuous case, you

### Continuous Action Space
When an Agent uses a Policy set to the **Continuous** vector action space, the
action parameter passed to the Agent's `AgentAction()` function is an array with
action parameter passed to the Agent's `OnActionReceived()` function is an array with
length equal to the `Vector Action Space Size` property value.
The individual values in the array have whatever meanings that you ascribe to
them. If you assign an element in the array as the speed of an Agent, for

These control values are applied as torques to the bodies making up the arm:
```csharp
public override void AgentAction(float[] act)
public override void OnActionReceived(float[] act)
{
float torque_x = Mathf.Clamp(act[0], -1, 1) * 100f;
float torque_z = Mathf.Clamp(act[1], -1, 1) * 100f;

### Discrete Action Space
When an Agent uses a **Discrete** vector action space, the
action parameter passed to the Agent's `AgentAction()` function is an array
action parameter passed to the Agent's `OnActionReceived()` function is an array
containing indices. With the discrete vector action space, `Branches` is an
array of integers, each value corresponds to the number of possibilities for
each branch.

agent be able to move __and__ jump concurrently. We define the first branch to
have 5 possible actions (don't move, go left, go right, go backward, go forward)
and the second one to have 2 possible actions (don't jump, jump). The
AgentAction method would look something like:
`OnActionReceived()` method would look something like:
```csharp
// Get the action index for movement
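// (Hedged continuation sketch of the two-branch example described above;
// the parameter and variable names here are illustrative, not the doc's exact code.)
int movement = Mathf.FloorToInt(vectorAction[0]);

// Get the action index for jumping
int jump = Mathf.FloorToInt(vectorAction[1]);

// Look up the index in the movement action list (0 = don't move)
Vector3 direction = Vector3.zero;
if (movement == 1) { direction = Vector3.left; }
if (movement == 2) { direction = Vector3.right; }
if (movement == 3) { direction = Vector3.back; }
if (movement == 4) { direction = Vector3.forward; }

// Look up the index in the jump action list (0 = don't jump)
if (jump == 1)
{
    // apply an upward impulse here
}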

Agent's Heuristic to control the Agent while watching how it accumulates rewards.
Allocate rewards to an Agent by calling the `AddReward()` method in the
`AgentAction()` function. The reward assigned between each decision
`OnActionReceived()` function. The reward assigned between each decision
should be in the range [-1,1]. Values outside this range can lead to
unstable training. The `reward` value is reset to zero when the agent receives a
new decision. If there are multiple calls to `AddReward()` for a single agent

### Examples
You can examine the `AgentAction()` functions defined in the [example
You can examine the `OnActionReceived()` functions defined in the [example
environments](Learning-Environment-Examples.md) to see how those projects
allocate rewards.

if (hitObjects.Where(col => col.gameObject.tag == "goal").ToArray().Length == 1)
{
AddReward(1.0f);
Done();
EndEpisode();
Done();
EndEpisode();
}
```

Mathf.Abs(gameObject.transform.position.x - area.transform.position.x) > 8f ||
Mathf.Abs(gameObject.transform.position.z + 5 - area.transform.position.z) > 8)
{
Done();
EndEpisode();
AddReward(-1f);
}
```

platform:
```csharp
if (IsDone() == false)
{
SetReward(0.1f);
}
SetReward(0.1f);
// When ball falls mark Agent as done and give a negative penalty
// When ball falls mark Agent as finished and give a negative penalty
Done();
EndEpisode();
}
```

Note that all of these environments make use of the `Done()` method, which manually
Note that all of these environments make use of the `EndEpisode()` method, which manually
terminates an episode when a termination condition is reached. This can be
called independently of the `Max Step` property.

33
docs/Learning-Environment-Design.md


Training and simulation proceed in steps orchestrated by the ML-Agents Academy
class. The Academy works with Agent objects in the scene to step
through the simulation. When all Agents in the scene are _done_,
one training episode is finished.
through the simulation.
During training, the external Python training process communicates with the
Academy to run a series of episodes while it collects data and optimizes its

The ML-Agents Academy class orchestrates the agent simulation loop as follows:
1. Calls your Academy's `OnEnvironmentReset` delegate.
2. Calls the `AgentReset()` function for each Agent in the scene.
2. Calls the `OnEpisodeBegin()` function for each Agent in the scene.
5. Calls the `AgentAction()` function for each Agent in the scene, passing in
the action chosen by the Agent's Policy. (This function is not called if the
Agent is done.)
6. Calls the Agent's `AgentReset()` function if the Agent has reached its `Max
Step` count or has otherwise marked itself as `done`.
5. Calls the `OnActionReceived()` function for each Agent in the scene, passing in
the action chosen by the Agent's Policy.
6. Calls the Agent's `OnEpisodeBegin()` function if the Agent has reached its `Max
Step` count or has otherwise ended its episode by calling `EndEpisode()`.
implement the above methods. The `Agent.CollectObservations(VectorSensor sensor)` and
`Agent.AgentAction()` functions are required; the other methods are optional —
whether you need to implement them or not depends on your specific scenario.
implement the above methods. Whether you need to implement them or not depends on
your specific scenario.
**Note:** The API used by the Python training process to communicate with
and control the Academy during training can be used for other purposes as well.

have appropriate `Behavior Parameters`.
To create an Agent, extend the Agent class and implement the essential
`CollectObservations(VectorSensor sensor)` and `AgentAction()` methods:
`CollectObservations(VectorSensor sensor)` and `OnActionReceived()` methods:
* `AgentAction()` — Carries out the action chosen by the Agent's Policy and
* `OnActionReceived()` — Carries out the action chosen by the Agent's Policy and
assigns a reward to the current state.
Your implementations of these functions determine how the Behavior Parameters

manually set an Agent to done in your `AgentAction()` function when the Agent
has finished (or irrevocably failed) its task by calling the `Done()` function.
manually terminate an Agent episode in your `OnActionReceived()` function when the Agent
has finished (or irrevocably failed) its task by calling the `EndEpisode()` function.
Agent will consider itself done after it has taken that many steps. You can
use the `Agent.AgentReset()` function to prepare the Agent to start again.
Agent will consider the episode over after it has taken that many steps. You can
use the `Agent.OnEpisodeBegin()` function to prepare the Agent to start again.
See [Agents](Learning-Environment-Design-Agents.md) for detailed information
about programming your own Agents.

* The Academy must reset the scene to a valid starting point for each episode of
training.
* A training episode must have a definite end — either using `Max Steps` or by
each Agent setting itself to `done`.
each Agent ending its episode manually with `EndEpisode()`.
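To make the loop above concrete, a skeletal Agent using the renamed methods might be laid out as follows; this is a hedged sketch and the method bodies are placeholders, not code from the ML-Agents examples.

```csharp
using MLAgents;
using MLAgents.Sensors;

// Minimal skeleton showing where each lifecycle method fits.
public class MyAgent : Agent
{
    public override void Initialize()
    {
        // One-time setup, e.g. caching component references.
    }

    public override void OnEpisodeBegin()
    {
        // Reset positions, velocities, and any episode-specific state.
    }

    public override void CollectObservations(VectorSensor sensor)
    {
        // sensor.AddObservation(...) calls go here.
    }

    public override void OnActionReceived(float[] vectorAction)
    {
        // Apply the action, assign rewards, and optionally call EndEpisode().
    }
}
```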

14
docs/Migrating.md


* `BrainParameters` and `SpaceType` have been removed from the public API
* `BehaviorParameters` have been removed from the public API.
* `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity.
* The following methods in the `Agent` class have been renamed. The original method names will be removed in a later release:
* `InitializeAgent()` was renamed to `Initialize()`
* `AgentAction()` was renamed to `OnActionReceived()`
* `AgentReset()` was renamed to `OnEpisodeBegin()`
* `Done()` was renamed to `EndEpisode()`
* `GiveModel()` was renamed to `SetModel()`
* The `IFloatProperties` interface has been removed.
### Steps to Migrate

* If you call `RayPerceptionSensor.PerceiveStatic()` manually, add your inputs to a `RayPerceptionInput`. To get the previous float array output,
iterate through `RayPerceptionOutput.rayOutputs` and call `RayPerceptionOutput.RayOutput.ToFloatArray()`.
* Replace all calls to `Agent.GetStepCount()` with `Agent.StepCount`.
* Replace all calls to `Agent.GetStepCount()` with `Agent.StepCount`
* We strongly recommend replacing the following methods with their new equivalent as they will be removed in a later release:
* `InitializeAgent()` to `Initialize()`
* `AgentAction()` to `OnActionReceived()`
* `AgentReset()` to `OnEpisodeBegin()`
* `Done()` to `EndEpisode()`
* `GiveModel()` to `SetModel()`
* Replace `IFloatProperties` variables with `FloatPropertiesChannel` variables.
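As a hedged illustration of these migration steps, typical call-site updates look like the sketch below, assuming the `MLAgents`, `MLAgents.SideChannels`, and `Barracuda` namespaces are in scope; the method name, agent reference, model, and behavior name are placeholders.

```csharp
// Hedged sketch of call-site updates; not code from this repository.
void ApplyRenamedCalls(Agent agent, NNModel myModel)
{
    agent.EndEpisode();                     // was agent.Done()
    agent.SetModel("MyBehavior", myModel);  // was agent.GiveModel("MyBehavior", myModel)

    // IFloatProperties fields become FloatPropertiesChannel:
    FloatPropertiesChannel resetParams = Academy.Instance.FloatProperties;
}
```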
## Migrating from 0.13 to 0.14

236
docs/Python-API.md


allows you to interact directly with a Unity Environment (`mlagents_envs`) and
an entry point to train (`mlagents-learn`) which allows you to train agents in
Unity Environments using our implementations of reinforcement learning or
imitation learning.
imitation learning. This document describes how to use the `mlagents_envs` API.
For information on using `mlagents-learn`, see [here](Training-ML-Agents.md).
You can use the Python Low Level API to interact directly with your learning
environment, and use it to develop new learning algorithms.
The Python Low Level API can be used to interact directly with your Unity learning environment.
As such, it can serve as the basis for developing and evaluating new learning algorithms.
## mlagents_envs

Python-side communication happens through `UnityEnvironment` which is located in
[`environment.py`](../ml-agents-envs/mlagents_envs/environment.py). To load
a Unity environment from a built binary file, put the file in the same directory
as `envs`. For example, if the filename of your Unity environment is 3DBall.app, in python, run:
as `envs`. For example, if the filename of your Unity environment is `3DBall`, in python, run:
```python
from mlagents_envs.environment import UnityEnvironment

`discrete_action_branches = (3,2,)`)
### Modifying the environment from Python
The Environment can be modified by using side channels to send data to the
environment. When creating the environment, pass a list of side channels as
`side_channels` argument to the constructor.
### Communicating additional information with the Environment
In addition to the means of communicating between Unity and python described above,
we also provide methods for sharing agent-agnostic information. These
additional methods are referred to as side channels. ML-Agents includes two ready-made
side channels, described below. It is also possible to create custom side channels to
communicate any additional data between a Unity environment and Python. Instructions for
creating custom side channels can be found [here](Custom-SideChannels.md).
Side channels exist as separate classes which are instantiated, and then passed as list to the `side_channels` argument of the constructor of the `UnityEnvironment` class.
```python
channel = MyChannel()
env = UnityEnvironment(side_channels = [channel])
```
__Note__ : A side channel will only send/receive messages when `env.step` is
__Note__ : A side channel will only send/receive messages when `env.step` or `env.reset()` is
An `EngineConfiguration` will allow you to modify the time scale and graphics quality of the Unity engine.
The `EngineConfiguration` side channel allows you to modify the time-scale, resolution, and graphics quality of the environment. This can be useful for adjusting the environment to perform better during training, or be more interpretable during inference.
* `set_configuration_parameters` with arguments
* width: Defines the width of the display. Default 80.
* height: Defines the height of the display. Default 80.
* quality_level: Defines the quality level of the simulation. Default 1.
* time_scale: Defines the multiplier for the deltatime in the simulation. If set to a higher value, time will pass faster in the simulation but the physics might break. Default 20.
* target_frame_rate: Instructs simulation to try to render at a specified frame rate. Default -1.
* `set_configuration_parameters` which takes the following arguments:
* `width`: Defines the width of the display. Default 80.
* `height`: Defines the height of the display. Default 80.
* `quality_level`: Defines the quality level of the simulation. Default 1.
* `time_scale`: Defines the multiplier for the deltatime in the simulation. If set to a higher value, time will pass faster in the simulation but the physics may perform unpredictably. Default 20.
* `target_frame_rate`: Instructs simulation to try to render at a specified frame rate. Default -1.
For example :
For example, the following code would adjust the time-scale of the simulation to be 2x realtime.
```python
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

```
#### FloatPropertiesChannel
A `FloatPropertiesChannel` will allow you to get and set float properties
in the environment. You can call get_property and set_property on the
side channel to read and write properties.
The `FloatPropertiesChannel` will allow you to get and set pre-defined numerical values in the environment. This can be useful for adjusting environment-specific settings, or for reading non-agent related information from the environment. You can call `get_property` and `set_property` on the side channel to read and write properties.
`FloatPropertiesChannel` has three methods:
* `set_property` Sets a property in the Unity Environment.

channel.set_property("parameter_1", 2.0)
i = env.reset()
readout_value = channel.get_property("parameter_2")
...
```

float property1 = sharedProperties.GetPropertyWithDefault("parameter_1", 0.0f);
```
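For context, the `sharedProperties` object in the C# fragment above is the environment-side float properties channel; a hedged sketch of how it can be obtained from the Academy (mirroring the example scripts elsewhere in this commit) is:

```csharp
// Assumption: reading the default float properties channel exposed by the Academy.
var sharedProperties = Academy.Instance.FloatProperties;
float property1 = sharedProperties.GetPropertyWithDefault("parameter_1", 0.0f);
```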
#### [Advanced] Create your own SideChannel
You can create your own `SideChannel` in C# and Python and use it to communicate data between the two.
##### Unity side
The side channel will have to implement the `SideChannel` abstract class and the following method.
* `OnMessageReceived(byte[] data)` : You must implement this method to specify what the side channel will be doing
with the data received from Python. The data is a `byte[]` argument.
The side channel must also assign a `ChannelId` property in the constructor. The `ChannelId` is a Guid
(or UUID in Python) used to uniquely identify a side channel. This Guid must be the same on C# and Python.
There can only be one side channel of a certain id during communication.
To send a byte array from C# to Python, call the `base.QueueMessageToSend(data)` method inside the side channel.
The `data` argument must be a `byte[]`.
To register a side channel on the Unity side, call `Academy.Instance.RegisterSideChannel` with the side channel
as only argument.
##### Python side
The side channel will have to implement the `SideChannel` abstract class. You must implement :
* `on_message_received(self, data: bytes) -> None` : You must implement this method to specify what the
side channel will be doing with the data received from Unity. The data is a `byte[]` argument.
The side channel must also assign a `channel_id` property in the constructor. The `channel_id` is a UUID
(referred to in C# as Guid) used to uniquely identify a side channel. This number must be the same on C# and
Python. There can only be one side channel of a certain id during communication.
To assign the `channel_id` call the abstract class constructor with the appropriate `channel_id` as follows:
```python
super().__init__(my_channel_id)
```
To send a byte array from Python to C#, call the `super().queue_message_to_send(bytes_data)` method inside the
side channel. The `bytes_data` argument must be a `bytes` object.
To register a side channel on the Python side, pass the side channel as argument when creating the
`UnityEnvironment` object. One of the arguments of the constructor (`side_channels`) is a list of side channels.
##### Example implementation
Here is a simple implementation of a Side Channel that will exchange strings between C# and Python
(encoded as ascii).
On the C# side:
Here is an implementation of a `StringLogSideChannel` that will listen to the `UnityEngine.Debug.LogError` calls in
the game:
```csharp
using UnityEngine;
using MLAgents;
using System.Text;
using System;
public class StringLogSideChannel : SideChannel
{
public StringLogSideChannel()
{
ChannelId = new Guid("621f0a70-4f87-11ea-a6bf-784f4387d1f7");
}
public override void OnMessageReceived(byte[] data)
{
var receivedString = Encoding.ASCII.GetString(data);
Debug.Log("From Python : " + receivedString);
}
public void SendDebugStatementToPython(string logString, string stackTrace, LogType type)
{
if (type == LogType.Error)
{
var stringToSend = type.ToString() + ": " + logString + "\n" + stackTrace;
var encodedString = Encoding.ASCII.GetBytes(stringToSend);
base.QueueMessageToSend(encodedString);
}
}
}
```
We also need to register this side channel with the Academy and subscribe it to the `Application.logMessageReceived` event,
so we write a simple MonoBehaviour for this. (Do not forget to attach it to a GameObject in the scene.)
```csharp
using UnityEngine;
using MLAgents;
public class RegisterStringLogSideChannel : MonoBehaviour
{
StringLogSideChannel stringChannel;
public void Awake()
{
// We create the Side Channel
stringChannel = new StringLogSideChannel();
// When a Debug.Log message is created, we send it to the stringChannel
Application.logMessageReceived += stringChannel.SendDebugStatementToPython;
// Just in case the Academy has not yet initialized
Academy.Instance.RegisterSideChannel(stringChannel);
}
public void OnDestroy()
{
// De-register the Debug.Log callback
Application.logMessageReceived -= stringChannel.SendDebugStatementToPython;
if (Academy.IsInitialized){
Academy.Instance.UnregisterSideChannel(stringChannel);
}
}
public void Update()
{
// Optional : If the space bar is pressed, raise an error !
if (Input.GetKeyDown(KeyCode.Space))
{
Debug.LogError("This is a fake error. Space bar was pressed in Unity.");
}
}
}
```
And here is the script on the Python side. This script creates a new Side channel type (`StringLogChannel`) and
launches a `UnityEnvironment` with that side channel.
```python
import uuid  # needed for uuid.UUID below

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import SideChannel
import numpy as np


# Create the StringLogChannel class
class StringLogChannel(SideChannel):
    def __init__(self) -> None:
        super().__init__(uuid.UUID("621f0a70-4f87-11ea-a6bf-784f4387d1f7"))

    def on_message_received(self, data: bytes) -> None:
        """
        Note: We must implement this method of the SideChannel interface to
        receive messages from Unity
        """
        # We simply print the data received interpreted as ascii
        print(data.decode("ascii"))

    def send_string(self, data: str) -> None:
        # Convert the string to ascii
        bytes_data = data.encode("ascii")
        # We call this method to queue the data we want to send
        super().queue_message_to_send(bytes_data)


# Create the channel
string_log = StringLogChannel()

# We start the communication with the Unity Editor and pass the string_log side channel as input
env = UnityEnvironment(base_port=UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels=[string_log])
env.reset()
string_log.send_string("The environment was reset")

group_name = env.get_agent_groups()[0]  # Get the first group_name
for i in range(1000):
    step_data = env.get_step_result(group_name)
    n_agents = step_data.n_agents()  # Get the number of agents
    # We send data to Unity: a string with the number of agents at each step
    string_log.send_string(
        "Step " + str(i) + " occurred with " + str(n_agents) + " agents."
    )
    env.step()  # Move the simulation forward

env.close()
```
Now, if you run this script and press `Play` in the Unity Editor when prompted, the console in the Unity Editor will
display a message at every Python step. Additionally, if you press the space bar in the Unity Editor, a message will
appear in the terminal.
For information on how to make custom side channels for sending additional data types, see the documentation [here](Custom-SideChannels.md).

1
docs/Readme.md


* [Using the Monitor](Feature-Monitor.md)
* [Using the Video Recorder](https://github.com/Unity-Technologies/video-recorder)
* [Using an Executable Environment](Learning-Environment-Executable.md)
* [Creating Custom Side Channels](Custom-SideChannels.md)
## Training

2
docs/Training-Curriculum-Learning.md


greater than number of thresholds.
Once our curriculum is defined, we have to use the reset parameters we defined
and modify the environment from the Agent's `AgentReset()` function. See
and modify the environment from the Agent's `OnEpisodeBegin()` function. See
[WallJumpAgent.cs](https://github.com/Unity-Technologies/ml-agents/blob/master/Project/Assets/ML-Agents/Examples/WallJump/Scripts/WallJumpAgent.cs)
for an example.
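For instance, a hedged sketch in the spirit of WallJumpAgent.cs (whose diff appears earlier in this commit) reads a reset parameter and applies it when the episode begins; the `wall_height` key, default value, and `wall` field are illustrative stand-ins for that script's actual names.

```csharp
public override void OnEpisodeBegin()
{
    // Illustrative key and default; WallJumpAgent reads keys such as
    // "no_wall_height" through the same GetPropertyWithDefault API.
    var wallHeight = Academy.Instance.FloatProperties.GetPropertyWithDefault("wall_height", 4f);
    var localScale = wall.transform.localScale;
    wall.transform.localScale = new Vector3(localScale.x, wallHeight, localScale.z);
}
```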

5
ml-agents/mlagents/trainers/components/reward_signals/__init__.py


from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
logger = logging.getLogger("mlagents.trainers")

self.strength = strength
self.stats_name_to_update_name: Dict[str, str] = {}
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
"""
Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward
function drawn straight from a Buffer.

)
def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict

5
ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py


from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
class CuriosityRewardSignal(RewardSignal):

}
self.has_updated = False
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.sequence_length_ph: self.policy.sequence_length,

super().check_config(config_dict, param_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
Prepare for update and get feed_dict.

3
ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py


import numpy as np
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.buffer import AgentBuffer
class ExtrinsicRewardSignal(RewardSignal):

param_keys = ["strength", "gamma"]
super().check_config(config_dict, param_keys)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
return RewardSignalResult(self.strength * env_rews, env_rews)

16
ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py


from mlagents.trainers.policy.tf_policy import TFPolicy
from .model import GAILModel
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
class GAILRewardSignal(RewardSignal):

"Policy/GAIL Expert Estimate": "gail_expert_estimate",
}
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.sequence_length_ph: self.policy.sequence_length,

super().check_config(config_dict, param_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
Prepare inputs for update.

"""
max_num_experiences = min(
len(mini_batch["actions"]), self.demonstration_buffer.num_experiences
)
# If num_sequences is less, we need to shorten the input batch.
for key, element in mini_batch.items():
mini_batch[key] = element[:max_num_experiences]
# Get batch from demo buffer
# Get batch from demo buffer. Even if demo buffer is smaller, we sample with replacement
len(mini_batch["actions"]), 1
mini_batch.num_experiences, 1
)
feed_dict: Dict[tf.Tensor, Any] = {

4
ml-agents/mlagents/trainers/sac/optimizer.py


return update_stats
def update_reward_signals(
self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int
) -> Dict[str, float]:
"""
Only update the reward signals.

feed_dict: Dict[tf.Tensor, Any],
update_dict: Dict[str, tf.Tensor],
stats_needed: Dict[str, str],
reward_signal_minibatches: Mapping[str, Dict],
reward_signal_minibatches: Mapping[str, AgentBuffer],
num_sequences: int,
) -> None:
"""

66
ml-agents/mlagents/trainers/tests/test_ppo.py


from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
gail_dummy_config,
)
@pytest.fixture

optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // dummy_config["sequence_length"],
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_curiosity(
curiosity_dummy_config, dummy_config, rnn, visual, discrete # noqa: F811
):
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(curiosity_dummy_config)
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(gail_dummy_config)
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
# Check if buffer size is too big
update_buffer = mb.simulate_rollout(3000, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)

207
docs/Custom-SideChannels.md


# Custom Side Channels
You can create your own side channel in C# and Python and use it to communicate
custom data structures between the two. This can be useful for situations in
which the data to be sent is too complex or structured for the built-in
`FloatPropertiesChannel`, or is not related to any specific agent, and therefore
inappropriate as an agent observation.
## Overview
To use a side channel, it must be implemented as both a Unity (C#) class and a Python class.
### Unity side
The side channel must inherit from the `SideChannel` abstract class and implement the following method:
* `OnMessageReceived(byte[] data)`: implement this method to specify what the side channel does with the
data received from Python. The data arrives as a `byte[]` argument.
The side channel must also assign a `ChannelId` property in its constructor. The `ChannelId` is a Guid
(a UUID in Python) used to uniquely identify the side channel. This Guid must be the same on the C# and Python
sides, and only one side channel with a given id can be registered during communication.
To send a byte array from C# to Python, call the `base.QueueMessageToSend(data)` method inside the side channel.
The `data` argument must be a `byte[]`.
To register a side channel on the Unity side, call `Academy.Instance.RegisterSideChannel` with the side channel
as its only argument.
### Python side
The side channel must inherit from the `SideChannel` abstract class and implement the following method:
* `on_message_received(self, data: bytes) -> None`: implement this method to specify what the
side channel does with the data received from Unity. The data arrives as a `bytes` object.
The side channel must also assign a `channel_id` property in its constructor. The `channel_id` is a UUID
(referred to in C# as a Guid) used to uniquely identify the side channel. This UUID must be the same on the C# and
Python sides, and only one side channel with a given id can be registered during communication.
To assign the `channel_id` call the abstract class constructor with the appropriate `channel_id` as follows:
```python
super().__init__(my_channel_id)
```
To send a byte array from Python to C#, call the `super().queue_message_to_send(bytes_data)` method inside the
side channel. The `bytes_data` argument must be a `bytes` object.
To register a side channel on the Python side, pass the side channel as an argument when creating the
`UnityEnvironment` object: its `side_channels` constructor argument takes a list of side channels, as shown in the sketch below.
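As a minimal sketch (the `EchoChannel` class name and the all-zero UUID are placeholders for illustration, not part of ML-Agents), the Python half can be wired up like this; the full `StringLogChannel` example in the next section follows the same pattern:
```python
import uuid
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import SideChannel

class EchoChannel(SideChannel):  # illustrative name, not an ML-Agents class
    def __init__(self) -> None:
        # Placeholder UUID; it must match the Guid of the corresponding C# channel.
        super().__init__(uuid.UUID("00000000-0000-0000-0000-000000000000"))

    def on_message_received(self, data: bytes) -> None:
        # Called whenever the C# side queues a message for this channel.
        print(f"Received {len(data)} bytes from Unity")

    def send_bytes(self, data: bytes) -> None:
        # Queue a raw payload; it is delivered with the next environment step.
        super().queue_message_to_send(data)

# Register the channel by passing it in the side_channels list of the constructor.
env = UnityEnvironment(
    base_port=UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels=[EchoChannel()]
)
```
Once the channel is registered this way, any call to `send_bytes` on the Python object is delivered to the C# channel with the matching Guid, and messages queued on the C# side arrive through `on_message_received`.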
## Example implementation
Below is a simple implementation of a side channel that exchanges ASCII-encoded
strings between a Unity environment and Python.
### Example Unity C# code
The first step is to create the `StringLogSideChannel` class within the Unity project.
Here is an implementation of a `StringLogSideChannel` that listens for messages
from Python and prints them to the Unity debug log, and also sends error messages
from Unity to Python.
```csharp
using UnityEngine;
using MLAgents;
using MLAgents.SideChannels;
using System.Text;
using System;
public class StringLogSideChannel : SideChannel
{
public StringLogSideChannel()
{
ChannelId = new Guid("621f0a70-4f87-11ea-a6bf-784f4387d1f7");
}
public override void OnMessageReceived(byte[] data)
{
var receivedString = Encoding.ASCII.GetString(data);
Debug.Log("From Python : " + receivedString);
}
public void SendDebugStatementToPython(string logString, string stackTrace, LogType type)
{
if (type == LogType.Error)
{
var stringToSend = type.ToString() + ": " + logString + "\n" + stackTrace;
var encodedString = Encoding.ASCII.GetBytes(stringToSend);
base.QueueMessageToSend(encodedString);
}
}
}
```
Once we have defined our custom side channel class, we need to ensure that it is
instantiated and registered. This can typically be done wherever it makes sense for the
side channel's logic to live, for example on a MonoBehaviour object that needs to
access data from the side channel. Below is a simple MonoBehaviour that instantiates
and registers the new side channel. If you have not already done so, make sure that the
MonoBehaviour which registers the side channel is attached to a GameObject that is
active in your Unity scene.
```csharp
using UnityEngine;
using MLAgents;
public class RegisterStringLogSideChannel : MonoBehaviour
{
StringLogSideChannel stringChannel;
public void Awake()
{
// We create the Side Channel
stringChannel = new StringLogSideChannel();
// When a Debug.Log message is created, we send it to the stringChannel
Application.logMessageReceived += stringChannel.SendDebugStatementToPython;
// The channel must be registered with the Academy
Academy.Instance.RegisterSideChannel(stringChannel);
}
public void OnDestroy()
{
// De-register the Debug.Log callback
Application.logMessageReceived -= stringChannel.SendDebugStatementToPython;
if (Academy.IsInitialized){
Academy.Instance.UnregisterSideChannel(stringChannel);
}
}
public void Update()
{
// Optional: If the space bar is pressed, raise an error!
if (Input.GetKeyDown(KeyCode.Space))
{
Debug.LogError("This is a fake error. Space bar was pressed in Unity.");
}
}
}
```
### Example Python code
Now that we have created the necessary Unity C# classes, we can create their Python counterparts.
```python
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import SideChannel
import numpy as np
import uuid
# Create the StringLogChannel class
class StringLogChannel(SideChannel):
def __init__(self) -> None:
super().__init__(uuid.UUID("621f0a70-4f87-11ea-a6bf-784f4387d1f7"))
def on_message_received(self, data: bytes) -> None:
"""
Note: We must implement this method of the SideChannel interface to
receive messages from Unity
"""
# We simply print the data received interpreted as ascii
print(data.decode("ascii"))
def send_string(self, data: str) -> None:
# Convert the string to ascii
bytes_data = data.encode("ascii")
# We call this method to queue the data we want to send
super().queue_message_to_send(bytes_data)
```
We can then instantiate the new side channel,
launch a `UnityEnvironment` with that side channel active, and send a series of
messages to the Unity environment from Python using it.
```python
# Create the channel
string_log = StringLogChannel()
# We start the communication with the Unity Editor and pass the string_log side channel as input
env = UnityEnvironment(base_port=UnityEnvironment.DEFAULT_EDITOR_PORT, side_channels=[string_log])
env.reset()
string_log.send_string("The environment was reset")
group_name = env.get_agent_groups()[0] # Get the first group_name
for i in range(1000):
step_data = env.get_step_result(group_name)
n_agents = step_data.n_agents() # Get the number of agents
    # We send data to Unity: a string with the number of agents at each step
string_log.send_string(
"Step " + str(i) + " occurred with " + str(n_agents) + " agents."
)
env.step() # Move the simulation forward
env.close()
```
Now, if you run this script and press `Play` in the Unity Editor when prompted,
the Unity Editor console will display a message at every Python step.
Additionally, if you press the space bar while the Unity scene is running, an error
message will appear in the Python terminal.