Add reward manager and hurryUpReward

4 年前 · decf9a0a
--- a/Project/Assets/ML-Agents/Examples/Walker/Prefabs/DynamicPlatformWalker.prefab
+++ b/Project/Assets/ML-Agents/Examples/Walker/Prefabs/DynamicPlatformWalker.prefab
  m_SortingLayerID: 0
  m_SortingLayer: 0
  m_SortingOrder: 0
+--- !u!114 &758428434940870733
+MonoBehaviour:
+  m_ObjectHideFlags: 0
+  m_CorrespondingSourceObject: {fileID: 0}
+  m_PrefabInstance: {fileID: 0}
+  m_PrefabAsset: {fileID: 0}
+  m_GameObject: {fileID: 6065910098925129117}
+  m_Enabled: 1
+  m_EditorHideFlags: 0
+  m_Script: {fileID: 11500000, guid: 2d2b3caecf069467ebf3a650d8ee401e, type: 3}
+  m_Name: 
+  m_EditorClassIdentifier: 
+  rewardsList:
+  - rewardKey: matchSpeed
+    rewardScalar: 0.01
+    rewardThisStep: 0
+    cumulativeThisEpisode: 0
+    cumulativeThisSession: 0
+    maxRewardThisSession: 0
+  - rewardKey: lookAtTarget
+    rewardScalar: 0.01
+    rewardThisStep: 0
+    cumulativeThisEpisode: 0
+    cumulativeThisSession: 0
+    maxRewardThisSession: 0
+  - rewardKey: headHeightOverFeet
+    rewardScalar: 0.005
+    rewardThisStep: 0
+    cumulativeThisEpisode: 0
+    cumulativeThisSession: 0
+    maxRewardThisSession: 0
+  - rewardKey: hurryUp
+    rewardScalar: 1
+    rewardThisStep: 0
+    cumulativeThisEpisode: 0
+    cumulativeThisSession: 0
+    maxRewardThisSession: 0
+  maxSteps: 0
 --- !u!1001 &6359877978260855390
 PrefabInstance:
  m_ObjectHideFlags: 0
      value: 
      objectReference: {fileID: 11400000, guid: e785133c5b0ac461588106642550d1b3,
        type: 3}
+    - target: {fileID: 895268871377934297, guid: 765582efd9dda46ed98564603316353f,
+        type: 3}
+      propertyPath: m_BrainParameters.VectorObservationSize
+      value: 237
+      objectReference: {fileID: 0}
    - target: {fileID: 895268871377934298, guid: 765582efd9dda46ed98564603316353f,
        type: 3}
      propertyPath: m_LocalPosition.x
      propertyPath: target
      value: 
      objectReference: {fileID: 5064725739247198300}
+    - target: {fileID: 7408209125961349353, guid: 765582efd9dda46ed98564603316353f,
+        type: 3}
+      propertyPath: rewardManager
+      value: 
+      objectReference: {fileID: 758428434940870733}
+--- !u!1 &6065910098925129117 stripped
+GameObject:
+  m_CorrespondingSourceObject: {fileID: 895268871377934275, guid: 765582efd9dda46ed98564603316353f,
+    type: 3}
+  m_PrefabInstance: {fileID: 6359877978260855390}
+  m_PrefabAsset: {fileID: 0}
 --- !u!4 &6065910098925129092 stripped
 Transform:
  m_CorrespondingSourceObject: {fileID: 895268871377934298, guid: 765582efd9dda46ed98564603316353f,
    type: 3}
  m_PrefabInstance: {fileID: 6359877978260855390}
  m_PrefabAsset: {fileID: 0}
-  m_GameObject: {fileID: 0}
+  m_GameObject: {fileID: 6065910098925129117}
  m_Enabled: 1
  m_EditorHideFlags: 0
  m_Script: {fileID: 11500000, guid: ccb0f85f0009540d7ad997952e2aed7b, type: 3}
--- a/Project/Assets/ML-Agents/Examples/Walker/Scenes/WalkerDynamic.unity
+++ b/Project/Assets/ML-Agents/Examples/Walker/Scenes/WalkerDynamic.unity
    debug:
      m_Flags: 0
  m_NavMeshData: {fileID: 0}
+--- !u!114 &79411373 stripped
+MonoBehaviour:
+  m_CorrespondingSourceObject: {fileID: 758428434940870733, guid: f51e8260728fd4c8fa87bcda9d0e2027,
+    type: 3}
+  m_PrefabInstance: {fileID: 1615064471}
+  m_PrefabAsset: {fileID: 0}
+  m_GameObject: {fileID: 0}
+  m_Enabled: 1
+  m_EditorHideFlags: 0
+  m_Script: {fileID: 11500000, guid: 2d2b3caecf069467ebf3a650d8ee401e, type: 3}
+  m_Name: 
+  m_EditorClassIdentifier: 
 --- !u!1001 &193531851
 PrefabInstance:
  m_ObjectHideFlags: 0
        type: 3}
      propertyPath: rewardManager
      value: 
-      objectReference: {fileID: 0}
+      objectReference: {fileID: 79411373}
    - target: {fileID: 4712600297668500197, guid: f51e8260728fd4c8fa87bcda9d0e2027,
        type: 3}
      propertyPath: m_Name
--- a/Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs
+++ b/Project/Assets/ML-Agents/Examples/Walker/Scripts/WalkerAgent.cs

 public class WalkerAgent : Agent
 {
-    public float maximumWalkingSpeed = 999; //The max walk velocity magnitude an agent will be rewarded for
+    [Range(0, 10)]
+    public float walkingSpeed = 10; //Target walk speed the agent is rewarded for matching (randomized between 0 and 10 elsewhere in this class)
+
-//    Quaternion m_WalkDirLookRot; //Will hold the rotation to our target

    [Header("Target To Walk Towards")] [Space(10)]
    public TargetController target; //Target the agent will walk towards.

        orientationCube.UpdateOrientation(hips, target.transform);

+        rewardManager.ResetEpisodeRewards();
+        
+        walkingSpeed = Random.Range(0.0f, 10.0f); //Random Walk Speed
+
        SetResetParameters();
    }

    /// </summary>
    public override void CollectObservations(VectorSensor sensor)
    {
+        
+        sensor.AddObservation(walkingSpeed);
        sensor.AddObservation(Quaternion.FromToRotation(hips.forward, orientationCube.transform.forward));
        sensor.AddObservation(Quaternion.FromToRotation(head.forward, orientationCube.transform.forward));


    void FixedUpdate()
    {
-       UpdateRewards();
+        UpdateRewards();
-    public float headFacingDot;
-    public float hipsFacingDot;
-    public float headHeightOverFeetReward;
+    public float lookAtTargetReward; //reward for looking at the target
+    public float matchSpeedReward; //reward for matching the desired walking speed.
+    public float headHeightOverFeetReward; //reward for standing up straight-ish
+    public float hurryUpReward = -1; //don't waste time
-        headFacingDot = Vector3.Dot(cubeForward, head.forward);
-        hipsFacingDot = Vector3.Dot(cubeForward, hips.forward);
-        // a. Velocity alignment with goal direction.
-        var moveTowardsTargetReward =  Mathf.Exp(-0.1f * (orientationCube.transform.forward * maximumWalkingSpeed - m_JdController.bodyPartsDict[hips].rb.velocity).sqrMagnitude);
+        // a. Match target speed
+        //This reward will approach 1 if it matches and approach zero as it deviates
+        matchSpeedReward =
+            Mathf.Exp(-0.1f * (orientationCube.transform.forward * walkingSpeed -
+                               m_JdController.bodyPartsDict[hips].rb.velocity).sqrMagnitude);
-        var lookAtTargetReward = Vector3.Dot(cubeForward, head.forward);
+        lookAtTargetReward = Vector3.Dot(cubeForward, head.forward);
-        headHeightOverFeetReward = ((head.position.y - footL.position.y) + (head.position.y - footR.position.y)/10); //Should normalize to ~1
+        headHeightOverFeetReward =
+            ((head.position.y - footL.position.y) + (head.position.y - footR.position.y) / 10); //Should normalize to ~1
-        rewardManager.UpdateReward("moveTowardsTarget", moveTowardsTargetReward);
+
+        rewardManager.UpdateReward("matchSpeed", matchSpeedReward);
-//        rewardManager.UpdateReward("moveTowardsTargetReward", +0.02f * moveTowardsTargetReward);
-//        rewardManager.UpdateReward("lookAtTargetReward", +0.01f * lookAtTargetReward);
-//        rewardManager.UpdateReward("headHeightOverFeetReward", +0.01f * headHeightOverFeetReward);
+        rewardManager.UpdateReward("hurryUp", hurryUpReward); 
+    
 //    void FixedUpdate()
 //    {
 //        var cubeForward = orientationCube.transform.forward;
    {
        SetTorsoMass();
    }
-}
+}
--- a/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/RewardManager.cs
+++ b/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/RewardManager.cs
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using Unity.MLAgents;
+using UnityEngine;
+
+/// <summary>
+/// Tracks a set of named, individually scaled reward terms and forwards each scaled
+/// value to the <see cref="Agent"/> found on the same GameObject.
+/// </summary>
+/// <remarks>
+/// Reward terms are authored in <see cref="rewardsList"/> and indexed by key into
+/// <see cref="rewardsDict"/> when the component is enabled.
+/// </remarks>
+public class RewardManager : MonoBehaviour
+{
+    /// <summary>
+    /// One named reward term together with its running totals.
+    /// </summary>
+    [Serializable]
+    public class Reward
+    {
+        /// <summary>Key used to address this reward in <see cref="RewardManager.UpdateReward"/>.</summary>
+        public string rewardKey;
+
+        /// <summary>Multiplier applied to the raw value passed to <see cref="RewardManager.UpdateReward"/>.</summary>
+        public float rewardScalar = .01f;
+
+        /// <summary>Scaled value produced by the most recent update.</summary>
+        public float rewardThisStep;
+
+        /// <summary>Sum of scaled values since the last <see cref="RewardManager.ResetEpisodeRewards"/>.</summary>
+        public float cumulativeThisEpisode;
+
+        /// <summary>Sum of all scaled values; this class never resets it.</summary>
+        public float cumulativeThisSession;
+
+        /// <summary>
+        /// Written on every update as 1 / <see cref="RewardManager.maxSteps"/>
+        /// (0 when the step limit is not positive).
+        /// </summary>
+        public float maxRewardThisSession;
+    }
+
+    private Agent m_thisAgent;
+
+    /// <summary>Reward terms to register when the component is enabled.</summary>
+    public List<Reward> rewardsList = new List<Reward>();
+
+    /// <summary>Lookup of registered rewards by <see cref="Reward.rewardKey"/>.</summary>
+    public Dictionary<string, Reward> rewardsDict = new Dictionary<string, Reward>();
+
+    /// <summary>Copy of the agent's <c>MaxStep</c>, captured in <see cref="OnEnable"/>.</summary>
+    public float maxSteps;
+
+    /// <summary>
+    /// Caches the agent and its step limit, then registers every entry of
+    /// <see cref="rewardsList"/> that is not registered yet.
+    /// </summary>
+    private void OnEnable()
+    {
+        // Assumes an Agent component lives on the same GameObject.
+        m_thisAgent = GetComponent<Agent>();
+        maxSteps = m_thisAgent.MaxStep;
+        foreach (var item in rewardsList)
+        {
+            // Skip keys that are already present (duplicate entry or a re-enable) rather
+            // than leaving the method, so the remaining entries are still registered.
+            if (rewardsDict.ContainsKey(item.rewardKey))
+            {
+                continue;
+            }
+
+            rewardsDict.Add(item.rewardKey, item);
+        }
+    }
+
+    /// <summary>
+    /// Scales <paramref name="rawVal"/> by the reward's scalar, records it in the
+    /// reward's per-step, per-episode and per-session fields, and adds it to the agent.
+    /// </summary>
+    /// <param name="key">Key of a reward registered from <see cref="rewardsList"/>.</param>
+    /// <param name="rawVal">Unscaled reward value for this step.</param>
+    /// <exception cref="KeyNotFoundException">
+    /// No reward with <paramref name="key"/> has been registered.
+    /// </exception>
+    public void UpdateReward(string key, float rawVal)
+    {
+        if (!rewardsDict.TryGetValue(key, out var reward))
+        {
+            throw new KeyNotFoundException(
+                $"No reward with key '{key}' is registered in rewardsList on '{name}'.");
+        }
+
+        float val = rawVal * reward.rewardScalar;
+
+        // A step limit of 0 would otherwise store Infinity here.
+        reward.maxRewardThisSession = maxSteps > 0 ? 1 / maxSteps : 0;
+        reward.rewardThisStep = val;
+        reward.cumulativeThisEpisode += val;
+        reward.cumulativeThisSession += val;
+        m_thisAgent.AddReward(val);
+    }
+
+    /// <summary>
+    /// Zeroes the per-step and per-episode values of every registered reward;
+    /// session totals are left untouched.
+    /// </summary>
+    public void ResetEpisodeRewards()
+    {
+        foreach (var reward in rewardsDict.Values)
+        {
+            reward.rewardThisStep = 0;
+            reward.cumulativeThisEpisode = 0;
+        }
+    }
+}
--- a/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/RewardManager.cs.meta
+++ b/Project/Assets/ML-Agents/Examples/SharedAssets/Scripts/RewardManager.cs.meta
+fileFormatVersion: 2
+guid: 2d2b3caecf069467ebf3a650d8ee401e
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: