
Fix cumulative reward (Unity) and Nan reward (python) bugs

Tag: 0.2.0
Arthur Juliani, 7 years ago
Commit b56259f6
8 files changed, 38 insertions and 31 deletions
  1. python/ppo.py (1 line changed)
  2. python/ppo/trainer.py (20 lines changed)
  3. python/unityagents/environment.py (2 lines changed)
  4. unity-environment/Assets/ML-Agents/Examples/Reacher/Prefabs/Agent.prefab (27 lines changed)
  5. unity-environment/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (3 lines changed)
  6. unity-environment/Assets/ML-Agents/Scripts/Agent.cs (13 lines changed)
  7. unity-environment/Assets/ML-Agents/Scripts/Brain.cs (1 line changed)
  8. unity-environment/Assets/ML-Agents/Scripts/CoreBrainExternal.cs (2 lines changed)

python/ppo.py (1 line changed)


while steps <= max_steps or not train_model:
    if env.global_done:
        info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        trainer.reset_buffers(info, total=True)
    # Decide and take an action
    new_info = trainer.take_action(info, env, brain_name, steps)
    info = new_info

python/ppo/trainer.py (20 lines changed)


                 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
        self.stats = stats
        self.is_training = training
        self.training_buffer = vectorize_history(empty_local_history({}))
        self.history_dict = empty_all_history(info)
        self.reset_buffers(info, total=True)
        self.is_continuous = is_continuous
        self.use_observations = use_observations

            history['cumulative_reward'] = 0
            history['episode_steps'] = 0

    def reset_buffers(self, brain_info=None, total=False):
        self.training_buffer = vectorize_history(empty_local_history({}))
        if not total:
            for key in self.history_dict:
                self.history_dict[key] = empty_local_history(self.history_dict[key])
        else:
            self.history_dict = empty_all_history(agent_info=brain_info)
    def update_model(self, batch_size, num_epoch):
        """
        Uses training_buffer to update model.

                total_p += p_loss
        self.stats['value_loss'].append(total_v)
        self.stats['policy_loss'].append(total_p)
        self.training_buffer = vectorize_history(empty_local_history({}))
        for key in self.history_dict:
            self.history_dict[key] = empty_local_history(self.history_dict[key])
        self.reset_buffers()
    def write_summary(self, summary_writer, steps, lesson_number):
        """

        """
        print("Mean Reward: {0}".format(np.mean(self.stats['cumulative_reward'])))
        if len(self.stats['cumulative_reward']) > 0:
            mean_reward = np.mean(self.stats['cumulative_reward'])
            print("Mean Reward: {0}".format(mean_reward))
        summary = tf.Summary()
        for key in self.stats:
            if len(self.stats[key]) > 0:
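Why this guard matters: np.mean of an empty sequence returns nan (NumPy also emits a "Mean of empty slice" RuntimeWarning), so reporting the mean of stats['cumulative_reward'] before any episode has completed yields NaN, which is the Python-side NaN reward this commit addresses. A minimal standalone sketch of the pattern (variable names are illustrative, not part of the trainer):

import numpy as np

rewards = []                              # no completed episodes recorded yet
print(np.mean(rewards))                   # nan, with a "Mean of empty slice" RuntimeWarning

if len(rewards) > 0:                      # the guard added in write_summary above
    print("Mean Reward: {0}".format(np.mean(rewards)))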

python/unityagents/environment.py (2 lines changed)


logger.info("\nLesson changed. Now in Lesson {0} : \t{1}"
.format(self._curriculum.get_lesson_number(),
', '.join([str(x)+' -> '+str(config[x]) for x in config])))
else:
elif config != {}:
logger.info("\nAcademy Reset. In Lesson {0} : \t{1}"
.format(self._curriculum.get_lesson_number(),
', '.join([str(x)+' -> '+str(config[x]) for x in config])))
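For context, the bare else: logged the "Academy Reset" line even when the curriculum supplied an empty config; with elif config != {}: the message is only emitted when there is a non-empty config to report. A small self-contained sketch of that behaviour, using the standard logging module with illustrative names (not the unityagents API):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

def log_reset(lesson_changed, lesson_number, config):
    detail = ', '.join(str(k) + ' -> ' + str(v) for k, v in config.items())
    if lesson_changed:
        logger.info("Lesson changed. Now in Lesson {0} : {1}".format(lesson_number, detail))
    elif config != {}:  # previously a bare else:, which also fired for an empty config
        logger.info("Academy Reset. In Lesson {0} : {1}".format(lesson_number, detail))

log_reset(False, 0, {})                   # silent after the change
log_reset(False, 1, {'goal_size': 5.0})   # logs the Academy Reset line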

unity-environment/Assets/ML-Agents/Examples/Reacher/Prefabs/Agent.prefab (27 lines changed)


- component: {fileID: 4836354168995630}
- component: {fileID: 33827327769986516}
- component: {fileID: 23913365403597130}
- component: {fileID: 114223302162522712}
m_Layer: 0
m_Name: GoalOn
m_TagString: Untagged

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Enabled: 1
m_CastShadows: 1
m_ReceiveShadows: 1
m_DynamicOccludee: 1
m_MotionVectors: 1
m_LightProbeUsage: 1
m_ReflectionProbeUsage: 1

m_PreserveUVs: 1
m_IgnoreNormalsForChartDetection: 0
m_ImportantGI: 0
m_StitchLightmapSeams: 0
m_SelectedEditorRenderState: 3
m_MinimumChartSize: 4
m_AutoUVMaxDistance: 0.5

m_Interpolate: 0
m_Constraints: 0
m_CollisionDetection: 0
--- !u!114 &114223302162522712
MonoBehaviour:
m_ObjectHideFlags: 1
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 100100000}
m_GameObject: {fileID: 1065277484498824}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: ee7cb421851cf4e088ddbd1254584c68, type: 3}
m_Name:
m_EditorClassIdentifier:
agent: {fileID: 1395682910799436}
hand: {fileID: 1654288206095398}
goalOn: {fileID: 0}
--- !u!114 &114928491800121992
MonoBehaviour:
m_ObjectHideFlags: 1

unity-environment/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (3 lines changed)


        torque_z = Mathf.Clamp(act[3], -1, 1) * 100f;
        rbB.AddTorque(new Vector3(torque_x, 0f, torque_z));
        //reward -= 0.001f;
        //reward += (Mathf.Max(-4f, pendulumB.transform.position.y) + 4f) / 200f;
    }

    void UpdateGoalPosition() {

        goal.transform.position = new Vector3(goalY, -1f, goalX) + transform.position;
    }

unity-environment/Assets/ML-Agents/Scripts/Agent.cs (13 lines changed)


     * If AgentMonitor is attached to the Agent, this value will be displayed.*/
    [HideInInspector]
    public float CummulativeReward;
    /**< \brief Do not modify: This keeps track of the cummulative reward.*/
    public float CumulativeReward;
    /**< \brief Do not modify: This keeps track of the cumulative reward.*/
    [HideInInspector]
    public int stepCounter;

    public void Reset()
    {
        memory = new float[brain.brainParameters.memorySize];
        CummulativeReward = 0f;
        CumulativeReward = 0f;
        stepCounter = 0;
        AgentReset();
    }

    {
        return reward;
    }

    public void SetCumulativeReward()
    {
        CumulativeReward += reward;
        //Debug.Log(reward);
    }

    /// Do not modify : Is used by the brain to collect done.

    {
        AgentStep(agentStoredAction);
        stepCounter += 1;
        CummulativeReward += reward;
        if ((stepCounter > maxStep) && (maxStep > 0))
        {
            done = true;

unity-environment/Assets/ML-Agents/Scripts/Brain.cs (1 line changed)


Dictionary<int, List<float>> result = new Dictionary<int, List<float>>();
foreach (KeyValuePair<int, Agent> idAgent in agents)
{
idAgent.Value.SetCumulativeReward();
List<float> states = idAgent.Value.CollectState();
if ((states.Count != brainParameters.stateSize) && (brainParameters.stateSpaceType == StateType.continuous))
{

unity-environment/Assets/ML-Agents/Scripts/CoreBrainExternal.cs (2 lines changed)


    {
        if (brain.gameObject.transform.parent.gameObject.GetComponent<Academy>().communicator == null)
        {
            coord = null;
            coord = null;
        }
        else if (brain.gameObject.transform.parent.gameObject.GetComponent<Academy>().communicator is ExternalCommunicator)
        {
