
Curriculum documentation and improved Area code

/tag-0.2.0
Arthur Juliani, 7 years ago
Current commit
94c20ef0
13 files changed: 802 insertions and 35 deletions
  1. python/curricula/push.json (8 changes)
  2. python/curricula/wall.json (8 changes)
  3. python/ppo.py (4 changes)
  4. python/unityagents/curriculum.py (2 changes)
  5. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/AreaAgent.cs (28 changes)
  6. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Push/PushAcademy.cs (8 changes)
  7. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Push/PushAgent.cs (11 changes)
  8. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Push/PushArea.cs (15 changes)
  9. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Wall/WallAgent.cs (5 changes)
  10. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Wall/WallArea.cs (2 changes)
  11. docs/curriculum.md (85 changes)
  12. images/curriculum.png (488 changes)
  13. images/math.png (173 changes)

python/curricula/push.json (8 changes)


 {
     "measure" : "reward",
-    "thresholds" : [0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65],
-    "min_lesson_length" : 3,
+    "thresholds" : [0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75],
+    "min_lesson_length" : 2,
     "signal_smoothing" : true,
     "parameters" :
     {
-        "object_size" : [2.0, 1.9, 1.8, 1.7, 1.6, 1.5, 1.4, 1.3, 1.2, 1.1]
+        "goal_size" : [2.5, 2.4, 2.3, 2.2, 2.1, 2.0, 1.9, 1.8, 1.7, 1.6, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0],
+        "block_size": [1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        "x_variation":[1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5]
     }
 }

python/curricula/wall.json (8 changes)


 {
     "measure" : "reward",
-    "thresholds" : [0.7, 0.7, 0.7, 0.6, 0.6, 0.6, 0.5, 0.5, 0.5],
-    "min_lesson_length" : 3,
+    "thresholds" : [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
+    "min_lesson_length" : 2,
     "signal_smoothing" : true,
     "parameters" :
     {
-        "min_wall_height" : [0, 0, 1, 1, 2, 2, 3, 3, 4, 4],
-        "max_wall_height" : [0, 1, 1, 2, 2, 3, 3, 4, 4, 5]
+        "min_wall_height" : [1, 1, 1, 2, 2, 3, 3, 4, 5],
+        "max_wall_height" : [2, 3, 4, 4, 5, 5, 6, 6, 6]
     }
 }

python/ppo.py (4 changes)


 Options:
     --help                   Show this message.
     --batch-size=<n>         How many experiences per gradient descent update step [default: 64].
-    --beta=<n>               Strength of entropy regularization [default: 2e-3].
+    --beta=<n>               Strength of entropy regularization [default: 2.5e-3].
-    --gamma=<n>              Reward discount rate [default: 0.995].
+    --gamma=<n>              Reward discount rate [default: 0.99].
     --hidden-units=<n>       Number of units in hidden layer [default: 64].
     --keep-checkpoints=<n>   How many model checkpoints to keep [default: 5].
     --lambd=<n>              Lambda parameter for GAE [default: 0.95].

python/unityagents/curriculum.py (2 changes)


         if self.data is None or progress is None:
             return {}
         if self.data["signal_smoothing"]:
-            progress = self.smoothing_value * 0.1 + 0.9 * progress
+            progress = self.smoothing_value * 0.25 + 0.75 * progress
             self.smoothing_value = progress
         self.lesson_length += 1
         if self.lesson_number < self.max_lesson_number:

unity-environment/Assets/ML-Agents/Examples/Area/Scripts/AreaAgent.cs (28 changes)


         return state;
     }
-    public void MoveAgent(int movement) {
+    public void MoveAgent(float[] act) {
-        if (movement == 1) { directionX = -1; }
-        if (movement == 2) { directionX = 1; }
-        if (movement == 3) { directionZ = -1; }
-        if (movement == 4) { directionZ = 1; }
-        if (movement == 5 && GetComponent<Rigidbody>().velocity.y <= 0) { directionY = 1; }
+        if (brain.brainParameters.actionSpaceType == StateType.continuous)
+        {
+            directionX = Mathf.Clamp(act[0], -1f, 1f);
+            directionZ = Mathf.Clamp(act[1], -1f, 1f);
+            directionY = Mathf.Clamp(act[2], -1f, 1f);
+            if (GetComponent<Rigidbody>().velocity.y > 0) { directionY = 0f; }
+        }
+        else {
+            int movement = Mathf.FloorToInt(act[0]);
+            if (movement == 1) { directionX = -1; }
+            if (movement == 2) { directionX = 1; }
+            if (movement == 3) { directionZ = -1; }
+            if (movement == 4) { directionZ = 1; }
+            if (movement == 5 && GetComponent<Rigidbody>().velocity.y <= 0) { directionY = 1; }
+        }
         float edge = 0.499f;
         float rayDepth = 0.51f;

     public override void AgentStep(float[] act)
     {
-        reward = -0.01f;
-        int movement = Mathf.FloorToInt(act[0]);
-        MoveAgent(movement);
+        reward = -0.005f;
+        MoveAgent(act);
         if (gameObject.transform.position.y < 0.0f || Mathf.Abs(gameObject.transform.position.x - area.transform.position.x) > 8f ||
             Mathf.Abs(gameObject.transform.position.z + 5 - area.transform.position.z) > 8)

unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Push/PushAcademy.cs (8 changes)


 public class PushAcademy : Academy {
-    public float objectSize;
+    public float goalSize;
+    public float blockSize;
+    public float xVariation;

-        objectSize = (int)resetParameters["object_size"];
+        goalSize = (float)resetParameters["goal_size"];
+        blockSize = (float)resetParameters["block_size"];
+        xVariation = (float)resetParameters["x_variation"];
     }

     public override void AcademyStep()

unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Push/PushAgent.cs (11 changes)


         state.Add(blockVelocity.y);
         state.Add(blockVelocity.z);
+        state.Add(block.transform.localScale.x);
+        state.Add(goalHolder.transform.localScale.x);

-        reward = -0.01f;
-        int movement = Mathf.FloorToInt(act[0]);
-        MoveAgent(movement);
+        reward = -0.005f;
+        MoveAgent(act);
         if (gameObject.transform.position.y < 0.0f || Mathf.Abs(gameObject.transform.position.x - area.transform.position.x) > 8f ||
             Mathf.Abs(gameObject.transform.position.z + 5 - area.transform.position.z) > 8)

     public override void AgentReset()
     {
-        transform.position = new Vector3(Random.Range(-3.5f, 3.5f), 1.1f, -8f) + area.transform.position;
+        float xVariation = GameObject.Find("Academy").GetComponent<PushAcademy>().xVariation;
+        transform.position = new Vector3(Random.Range(-xVariation, xVariation), 1.1f, -8f) + area.transform.position;
         GetComponent<Rigidbody>().velocity = new Vector3(0f, 0f, 0f);
         area.GetComponent<Area>().ResetArea();

unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Push/PushArea.cs (15 changes)


     public override void ResetArea()
     {
-        block.transform.position = new Vector3(Random.Range(-2.5f, 2.5f), 1f, Random.Range(-7f, -5f)) + gameObject.transform.position;
-        goalHolder.transform.position = new Vector3(Random.Range(-3.5f, 3.5f), -0.1f, Random.Range(0f, -3f)) + gameObject.transform.position;
+        float goalSize = academy.GetComponent<PushAcademy>().goalSize;
+        float blockSize = academy.GetComponent<PushAcademy>().blockSize;
+        float xVariation = academy.GetComponent<PushAcademy>().xVariation;
-        float size = academy.GetComponent<PushAcademy>().objectSize;
-        size = Random.Range(size * 0.9f, size * 1.1f);
-        block.transform.localScale = new Vector3(size, 1f, size);
-        goalHolder.transform.localScale = new Vector3(size + 1f, 1f, size + 1f);
+        block.transform.position = new Vector3(Random.Range(-xVariation, xVariation), 1f, -6f) + gameObject.transform.position;
+        goalHolder.transform.position = new Vector3(Random.Range(-xVariation, xVariation), -0.1f, -2f) + gameObject.transform.position;
+        goalSize = Random.Range(goalSize * 0.9f, goalSize * 1.1f);
+        blockSize = Random.Range(blockSize * 0.9f, blockSize * 1.1f);
+        block.transform.localScale = new Vector3(blockSize, 1f, blockSize);
+        goalHolder.transform.localScale = new Vector3(goalSize, 1f, goalSize);
     }
 }

unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Wall/WallAgent.cs (5 changes)


     public override void AgentStep(float[] act)
     {
-        reward = -0.01f;
-        int movement = Mathf.FloorToInt(act[0]);
-        MoveAgent(movement);
+        reward = -0.005f;
+        MoveAgent(act);
         if (gameObject.transform.position.y < 0.0f ||
             Mathf.Abs(gameObject.transform.position.x - area.transform.position.x) > 8f ||

unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Wall/WallArea.cs (2 changes)


     public override void ResetArea() {
         int wallHeightMin = (int)academy.GetComponent<WallAcademy>().minWallHeight;
         int wallHeightMax = (int)academy.GetComponent<WallAcademy>().maxWallHeight;
-        wall.transform.localScale = new Vector3(12f, Random.Range(wallHeightMin, wallHeightMax), 1f);
+        wall.transform.localScale = new Vector3(12f, Random.Range(wallHeightMin, wallHeightMax) - 0.1f, 1f);
         block.transform.position = new Vector3(Random.Range(-3.5f, 3.5f), 1f, Random.Range(-4f, -8f)) + gameObject.transform.position;
         goalHolder.transform.position = new Vector3(Random.Range(-3.5f, 3.5f), 0.25f, 0f) + gameObject.transform.position;
     }

docs/curriculum.md (85 changes)


# Curriculum Learning
## Background
Curriculum learning is a way of training a machine learning model where more difficult
aspects of a problem are gradually introduced in such a way that the model is always
optimally challenged. Here is a link to the original paper, which introduces the idea
formally. More generally, this idea has been around much longer, for it is how we humans
typically learn. Think of any primary school education: there is an ordering of classes
and topics. Arithmetic is taught before algebra, for example. Likewise, algebra is taught
before calculus. The skills and knowledge learned in the earlier subjects provide a
scaffolding for later lessons. The same principle can be applied to machine learning,
where training on easier tasks can provide a scaffolding for harder tasks in the future.
![Math](../images/math.png)
_Example of a mathematics curriculum. Lessons progress from simpler topics to more
complex ones, with each building on the last._
When we think about how Reinforcement Learning actually works, the primary learning
signal is a scalar reward received occasionally throughout training. In more complex
or difficult tasks, this reward can often be sparse, and rarely achieved. For example,
imagine a task in which an agent needs to scale a wall to arrive at a goal. The starting
point when training an agent to accomplish this task will be a random policy. That
starting policy will have the agent running in circles, and will likely never, or only
very rarely, scale the wall properly to achieve the reward. If we start with a simpler
task, such as moving toward an unobstructed goal, then the agent can easily learn to
accomplish the task. From there, we can slowly add to the difficulty of the task by
increasing the size of the wall, until the agent can complete the initially
near-impossible task of scaling the wall. We are including just such an environment with
ML-Agents 0.2, called Wall Area.
![Wall](../images/curriculum.png)
_Demonstration of a curriculum training scenario in which a progressively taller wall
obstructs the path to the goal._
To see this in action, observe the two learning curves below. Each displays the reward
over time for an agent trained using PPO with the same set of training hyperparameters.
The difference is that the agent on the left was trained using the full-height wall
version of the task, and the right agent was trained using the curriculum version of
the task. As you can see, without curriculum learning the agent has a lot of difficulty.
We think that by using well-crafted curricula, agents trained with reinforcement learning
will be able to accomplish tasks that would otherwise be much more difficult.
[INSERT TRAINING CURVES]
## How-To
So how does it work? In order to define a curriculum, the first step is to decide which
parameters of the environment will vary. In the case of the Wall Area environment, what
varies is the height of the wall. We can define this as a reset parameter in the Academy
object of our scene, and by doing so it becomes adjustable via the Python API. Rather
than adjusting it by hand, we then create a simple JSON file which describes the
structure of the curriculum. Within it we can set at what points in the training process
our wall height will change, either based on the percentage of training steps that have
taken place, or on the average reward the agent has received in the recent past.
Once these are in place, we simply launch ppo.py using the `--curriculum-file` flag to
point to the JSON file, and PPO will train using Curriculum Learning. Of course, we can
then keep track of the current lesson and progress via TensorBoard.
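As a rough sketch of what such a curriculum file drives, the snippet below shows one way a trainer could map the current lesson number to concrete reset parameters. This is an illustration only; the function name and details differ from the actual `unityagents` curriculum code.

```python
import json

def lesson_config(curriculum_path, lesson_number):
    """Illustrative helper: pick the reset parameters for a given lesson."""
    with open(curriculum_path) as f:
        data = json.load(f)
    # Each entry in "parameters" holds one value per lesson, so index every
    # array by the current lesson number.
    return {name: values[lesson_number]
            for name, values in data["parameters"].items()}

# With the Wall Area curriculum below, lesson 0 yields
# {"min_wall_height": 1, "max_wall_height": 2}, lesson 3 yields
# {"min_wall_height": 2, "max_wall_height": 4}, and so on. These values are
# handed to the Academy as reset parameters when the environment resets.
```

The curriculum file for the Wall Area environment looks like this: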
```json
{
    "measure" : "reward",
    "thresholds" : [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
    "min_lesson_length" : 2,
    "signal_smoothing" : true,
    "parameters" :
    {
        "min_wall_height" : [1, 1, 1, 2, 2, 3, 3, 4, 5],
        "max_wall_height" : [2, 3, 4, 4, 5, 5, 6, 6, 6]
    }
}
```
* `measure` - What to measure learning progress and advancement in lessons by.
    * `reward` - Uses a measure of received reward.
    * `progress` - Uses ratio of steps/max_steps.
* `thresholds` (float array) - Points in value of `measure` where lesson should be increased.
* `min_lesson_length` (int) - How many times the progress measure should be reported before
incrementing the lesson.
* `signal_smoothing` (true/false) - Whether to weight the current progress measure by previous values.
    * If `true`, weighting will be 0.75 (new) 0.25 (old).
* `parameters` (dictionary of key:string, value:float array) - Corresponds to Academy reset parameters to control. Length of each array
should be one greater than number of thresholds.
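Putting these fields together, here is a minimal sketch of the advancement rule they describe. It mirrors the behavior documented above, including the 0.75/0.25 smoothing weights, but the class and attribute names are illustrative rather than the actual `unityagents` API.

```python
class LessonTracker:
    """Illustrative sketch of curriculum lesson advancement."""

    def __init__(self, thresholds, min_lesson_length, signal_smoothing):
        self.thresholds = thresholds            # one threshold per lesson transition
        self.min_lesson_length = min_lesson_length
        self.signal_smoothing = signal_smoothing
        self.lesson_number = 0                  # index into each "parameters" array
        self.lesson_length = 0                  # progress reports seen in this lesson
        self.smoothing_value = 0.0

    def report_progress(self, progress):
        """`progress` is the chosen measure: recent mean reward, or steps/max_steps."""
        if self.signal_smoothing:
            # Exponentially smooth the measure: 0.75 (new) / 0.25 (old).
            progress = 0.25 * self.smoothing_value + 0.75 * progress
            self.smoothing_value = progress
        self.lesson_length += 1
        # Advance once the measure clears the current threshold and the lesson
        # has been reported at least min_lesson_length times.
        if (self.lesson_number < len(self.thresholds)
                and self.lesson_length >= self.min_lesson_length
                and progress > self.thresholds[self.lesson_number]):
            self.lesson_number += 1
            self.lesson_length = 0
```

With the Wall Area settings above, for example, the wall-height arrays only advance from lesson 0 to lesson 1 once the smoothed reward exceeds 0.5 and at least two progress reports have been made in that lesson.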

images/curriculum.png (488 changes)

Width: 2069 | Height: 449 | Size: 116 KiB

images/math.png (173 changes)

File diff is too large to display.
