
Added decision frequency and evaluation metric

/develop/parameterizedenvs
Scott, 4 years ago
Current commit: 97990611
7 files changed, 154 insertions(+), 33 deletions(-)
  1. Project/Assets/ML-Agents/Examples/3DBall/Scenes/3DBallMulti.unity (17)
  2. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgentArea.cs (26)
  3. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DMultiAgent.cs (54)
  4. Project/Assets/ML-Agents/Examples/SharedAssets/Materials/Checkers_Gray.mat (13)
  5. config/ppo/3DBallMulti_power.yaml (13)
  6. config/ppo/3DBallMulti_distance.yaml (32)
  7. config/ppo/3DBallMulti_time.yaml (32)

Project/Assets/ML-Agents/Examples/3DBall/Scenes/3DBallMulti.unity (17)


    - target: {fileID: 3027918195473112231, guid: 7f24aa5e0e9d54a9b8bb72772633cee7,
        type: 3}
      propertyPath: maxStep
-     value: 20
+     value: 200
      objectReference: {fileID: 0}
    - target: {fileID: 3027918195473112231, guid: 7f24aa5e0e9d54a9b8bb72772633cee7,
        type: 3}
      propertyPath: rewardType
      value: 1
      objectReference: {fileID: 0}
    - target: {fileID: 3027918195473112231, guid: 7f24aa5e0e9d54a9b8bb72772633cee7,
        type: 3}
      propertyPath: numberOfParallel
      value: 12
      objectReference: {fileID: 0}
+   - target: {fileID: 3027918195473112231, guid: 7f24aa5e0e9d54a9b8bb72772633cee7,
+       type: 3}
+     propertyPath: decisionFrequency
+     value: 5
+     objectReference: {fileID: 0}
    - target: {fileID: 8706416217891658080, guid: 7f24aa5e0e9d54a9b8bb72772633cee7,
        type: 3}
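For context, these propertyPath overrides target the serialized fields exposed by the area script. A minimal sketch of those fields as implied by the overrides above and the script diff below (the MonoBehaviour wrapper, defaults, and enum member set are assumptions, not the verbatim source):

    using UnityEngine;

    // Hypothetical condensation of the fields the scene overrides above target.
    public enum Ball3DRewardType { Time, Distance }   // member set assumed

    public class Ball3DAgentArea : MonoBehaviour
    {
        public int maxStep = 100;               // overridden to 200 in 3DBallMulti.unity
        public Ball3DRewardType rewardType;     // overridden to 1 (distance reward, per the configs)
        public int numberOfParallel;            // overridden to 12
        public int decisionFrequency = 5;       // new in this commit; overridden to 5
    }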

Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgentArea.cs (26)


[Tooltip("Number of maximum steps the agent can take in the environment. ")]
public int maxStep = 100;
[Tooltip("Specifies which reward function to use. For all environments")]
- public Ball3DRewardType rewardType;
+ public Ball3DRewardType rewardType = Ball3DRewardType.Time;
+ public int decisionFrequency = 5;
public void Awake()
{
Academy.Instance.OnEnvironmentReset += UpdateEnvs;
}
update_agents();
}

{
foreach (var actor in actorObjs)
{
- Ball3DMultiAgent agent = actor.GetComponent<Ball3DMultiAgent>();
+ Ball3DMultiAgent agent = actor.GetComponentInChildren<Ball3DMultiAgent>();
- agent.setMaxStep(maxStep);
+ agent.setMaxStep(maxStep * decisionFrequency);
+ DecisionRequester dr = agent.GetComponent<DecisionRequester>();
+ dr.DecisionPeriod = decisionFrequency;
}
}
public void AreaReset()

}
}
}
- public void FixedUpdate()
+ public void UpdateEnvs()
+ int df = (int)m_ResetParams.GetWithDefault("decisionFreq", decisionFrequency);
Ball3DRewardType rt = rewardType;
bool changed = false;
if (N != numberOfParallel)

changed = true;
maxStep = newStep;
}
+ if (df != decisionFrequency)
+ {
+ changed = true;
+ decisionFrequency = df;
+ }
if (rtype == 0)
{
rt = Ball3DRewardType.Time;

if (changed)
{
AreaReset();
update_agents();
update_agents();
}
}
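The hunks above are fragmentary, so here is a minimal sketch of how the per-agent update appears to fit together (method and field names are taken from the fragments; the assembled body is an assumption, not the verbatim source; Ball3DMultiAgent and DecisionRequester come from the example scripts and Unity.MLAgents):

    // Inside Ball3DAgentArea (sketch). Each parallel agent's MaxStep is scaled by the
    // decision frequency so an episode still contains `maxStep` decisions, and its
    // DecisionRequester asks for a decision every `decisionFrequency` Academy steps.
    void update_agents()
    {
        foreach (var actor in actorObjs)
        {
            Ball3DMultiAgent agent = actor.GetComponentInChildren<Ball3DMultiAgent>();
            agent.setMaxStep(maxStep * decisionFrequency);
            DecisionRequester dr = agent.GetComponent<DecisionRequester>();
            dr.DecisionPeriod = decisionFrequency;
        }
    }

UpdateEnvs, registered on Academy.Instance.OnEnvironmentReset in Awake, re-reads the environment parameters at each reset and, if anything changed, calls AreaReset() followed by update_agents().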

Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DMultiAgent.cs (54)


Rigidbody m_BallRb;
EnvironmentParameters m_ResetParams;
[Tooltip("Specifies which reward function to use. ")]
- public Ball3DRewardType m_RewardType;
+ public Ball3DRewardType m_RewardType = Ball3DRewardType.Time;
+ StatsRecorder statsRecorder;
+ int stepsInGoal = -1;
+ int timestep = 0;
+ float maxdist = 3.54f; // assumes positions range from -2.5 to 2.5 in each dimension; this is an upper bound.
public override void Initialize()
{
m_BallRb = ball.GetComponent<Rigidbody>();

}
}
// public void FixedUpdate()
// {
// MaxStep = stepvalue;
// }
public override void OnActionReceived(ActionBuffers actionBuffers)
{
var actionZ = 2f * Mathf.Clamp(actionBuffers.ContinuousActions[0], -1f, 1f);

{
gameObject.transform.Rotate(new Vector3(1, 0, 0), actionX);
}
bool fell = ((ball.transform.position.y - gameObject.transform.position.y) < -2f ||
Mathf.Abs(ball.transform.position.x - gameObject.transform.position.x) > 3f ||
Mathf.Abs(ball.transform.position.z - gameObject.transform.position.z) > 3f);
float reward = 0.0f;
if (m_RewardType == Ball3DRewardType.Time)
{

{
- reward = DistanceReward(ball.transform.position, goal.transform.position);
+ reward = DistanceReward(ball.transform.position, goal.transform.position, fell);
SetReward(reward);
if ((ball.transform.position.y - gameObject.transform.position.y) < -2f ||
Mathf.Abs(ball.transform.position.x - gameObject.transform.position.x) > 3f ||
Mathf.Abs(ball.transform.position.z - gameObject.transform.position.z) > 3f)
AddReward(reward);
+ float dist = Vector3.Distance(ball.transform.position, goal.transform.position);
+ if (dist <= epsilon)
+ {
+ stepsInGoal++;
+ }
if (fell)
+ void FixedUpdate()
+ {
+ timestep++;
+ }
public override void OnEpisodeBegin()
{
gameObject.transform.rotation = new Quaternion(0f, 0f, 0f, 0f);

+ gameObject.transform.position;
//Reset the parameters when the Agent is reset.
SetResetParameters();
+ if (stepsInGoal >= 0)
+ {
+ var statsRecorder = Academy.Instance.StatsRecorder;
+ statsRecorder.Add("Environment/EvalMetric", (float)stepsInGoal / (float)MaxStep);
+ }
+ stepsInGoal = 0;
+ timestep = 0;
}
public override void Heuristic(in ActionBuffers actionsOut)

return 0.0f;
}
- float DistanceReward(Vector3 ball, Vector3 goal)
+ float DistanceReward(Vector3 ball, Vector3 goal, bool fell)
- return -dist;
+ float reward = -dist;
+ if (fell)
+ {
+ reward += -maxdist*(MaxStep - timestep);
+ }
+ return reward;
float maxdist = 3.54f; // assumes positions range from -2.5 to 2.5 in each dimension; this is an upper bound.
float dist = Vector3.Distance(ball, goal);
// distance between the ball and the goal
dist = Mathf.Clamp(dist, 0, maxdist);
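Assembling the fragments, the new distance reward appears to work as follows (a sketch under those assumptions, not the verbatim method; maxdist, MaxStep, and timestep are the fields shown above):

    // Sketch of the reward changed in this commit: negative clamped distance to the
    // goal each step, plus a worst-case penalty for every remaining step if the ball
    // falls, so dropping the ball can never beat balancing it poorly.
    float DistanceReward(Vector3 ball, Vector3 goal, bool fell)
    {
        float dist = Mathf.Clamp(Vector3.Distance(ball, goal), 0f, maxdist);
        float reward = -dist;
        if (fell)
        {
            reward += -maxdist * (MaxStep - timestep);
        }
        return reward;
    }

The evaluation metric is independent of the chosen reward: stepsInGoal counts the steps on which the ball was within epsilon of the goal, and at the start of the next episode the fraction stepsInGoal / MaxStep is written to the StatsRecorder as "Environment/EvalMetric".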

Project/Assets/ML-Agents/Examples/SharedAssets/Materials/Checkers_Gray.mat (13)


Material:
serializedVersion: 6
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_Name: Checkers_Gray
m_Shader: {fileID: 47, guid: 0000000000000000f000000000000000, type: 0}
m_ShaderKeywords: _GLOSSYREFLECTIONS_OFF _METALLICGLOSSMAP _NORMALMAP _SPECULARHIGHLIGHTS_OFF

m_Offset: {x: 0, y: 0}
- _EmissionMap:
m_Texture: {fileID: 0}
- m_Scale: {x: 1, y: 1}
+ m_Scale: {x: 3, y: 3}
- m_Scale: {x: 1, y: 1}
+ m_Scale: {x: 3, y: 3}
m_Offset: {x: 0, y: 0}
- _MetallicGlossMap:
m_Texture: {fileID: 2800000, guid: 0dbde4b748147ad46bb2c40602273db7, type: 3}

- _GlossyReflections: 0
- _Metallic: 0
- _Mode: 0
-  - _OcclusionStrength: 1
+  - _OcclusionStrength: 0.359
- _Parallax: 0.02
- _SmoothnessTextureChannel: 0
- _SpecularHighlights: 0

m_Colors:
-  - _Color: {r: 1, g: 1, b: 1, a: 1}
+  - _Color: {r: 0.011347434, g: 0.8018868, b: 0.05088379, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}

config/ppo/3DBallMulti_power.yaml (13)


    trainer_type: ppo
    hyperparameters:
      batch_size: 64
-     buffer_size: 12000
+     buffer_size: 5000
-     lambd: 0.99
+     lambd: 0.9
      num_epoch: 3
      learning_rate_schedule: constant
    network_settings:

        gamma: 0.999
        strength: 1.0
    keep_checkpoints: 5
-   max_steps: 500000
+   max_steps: 5000000
-   summary_freq: 18000
+   summary_freq: 10000
-   maxStep: 20
+   maxStep: 1000
-   numParallel: 8
+   numParallel: 18
+   decisionFreq: 5
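The environment_parameters keys in these configs (maxStep, rewardType, numParallel, decisionFreq) are the same keys the area script reads back on environment reset. A minimal sketch of that lookup in C#, assuming the defaults shown (only the "decisionFreq" read appears verbatim in the diff):

    // Sketch: reading the trainer-side environment_parameters inside UpdateEnvs.
    var resetParams = Academy.Instance.EnvironmentParameters;
    int newStep = (int)resetParams.GetWithDefault("maxStep", maxStep);
    int N       = (int)resetParams.GetWithDefault("numParallel", numberOfParallel);
    int df      = (int)resetParams.GetWithDefault("decisionFreq", decisionFrequency);
    int rtype   = (int)resetParams.GetWithDefault("rewardType", (float)rewardType);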

config/ppo/3DBallMulti_distance.yaml (32)


behaviors:
  3DBall:
    trainer_type: ppo
    hyperparameters:
      batch_size: 64
      buffer_size: 5000
      learning_rate: 0.0003
      beta: 0.001
      epsilon: 0.2
      lambd: 0.9
      num_epoch: 3
      learning_rate_schedule: constant
    network_settings:
      normalize: true
      hidden_units: 128
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.999
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 5000000
    time_horizon: 5000
    summary_freq: 10000
environment_parameters:
  maxStep: 1000
  rewardType: 1
  numParallel: 18
  decisionFreq: 5

config/ppo/3DBallMulti_time.yaml (32)


behaviors:
  3DBall:
    trainer_type: ppo
    hyperparameters:
      batch_size: 64
      buffer_size: 5000
      learning_rate: 0.0003
      beta: 0.001
      epsilon: 0.2
      lambd: 0.9
      num_epoch: 3
      learning_rate_schedule: constant
    network_settings:
      normalize: true
      hidden_units: 128
      num_layers: 2
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.999
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 5000000
    time_horizon: 5000
    summary_freq: 10000
environment_parameters:
  maxStep: 1000
  rewardType: 0
  numParallel: 18
  decisionFreq: 5
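The distance and time configs are identical apart from rewardType: per these two files and the parsing in Ball3DAgentArea.cs, 0 selects the time reward and 1 the distance reward. A minimal sketch of that decoding (only the Time branch appears verbatim in the diff; the Distance branch and member name are assumptions):

    // Sketch: mapping the numeric rewardType environment parameter onto the enum.
    Ball3DRewardType rt = rewardType;
    if (rtype == 0)
    {
        rt = Ball3DRewardType.Time;
    }
    else if (rtype == 1)
    {
        rt = Ball3DRewardType.Distance;   // assumed member name
    }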