
Remove unused "last reward" logic, TF nodes

At each step, an unused `last_reward` variable in the TF graph is
updated in our PPO trainer.  There are also related unused methods
in various places in the codebase.  This change removes them.
/develop-generalizationTraining-TrainerController
Jonathan Harper, 6 years ago
Commit 177ee5b8
7 files changed, 9 insertions and 62 deletions
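For context, the pattern being deleted is sketched below. This is a minimal reconstruction, not the project's code: it assumes the TF 1.x API the trainers used at the time and shows a `last_reward` variable that lives in the graph and is overwritten every step through a placeholder and an assign op, while no loss, summary, or other op ever reads it.

import tensorflow as tf  # assumes TF 1.x, as used by ml-agents at the time

# Graph-side bookkeeping that nothing downstream consumes.
last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
new_reward = tf.placeholder(shape=[], dtype=tf.float32, name="new_reward")
update_reward = tf.assign(last_reward, new_reward)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Every training step pays for an extra session call just to store the value.
    sess.run(update_reward, feed_dict={new_reward: 1.5})
    print(sess.run(last_reward))  # 1.5, but no training op depends on it

Since nothing feeds on `last_reward`, dropping the variable, the placeholder, and the assign op (plus the trainer and policy methods that drive them) should not change training behavior.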
1. ml-agents/mlagents/trainers/bc/trainer.py (15 changed lines)
2. ml-agents/mlagents/trainers/ppo/models.py (13 changed lines)
3. ml-agents/mlagents/trainers/ppo/policy.py (16 changed lines)
4. ml-agents/mlagents/trainers/ppo/trainer.py (7 changed lines)
5. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2 changed lines)
6. ml-agents/mlagents/trainers/trainer.py (16 changed lines)
7. ml-agents/mlagents/trainers/trainer_controller.py (2 changed lines)

ml-agents/mlagents/trainers/bc/trainer.py (15 changed lines)

         """
         return self.policy.get_current_step()

-    @property
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
+    def increment_step(self):
         """
-        if len(self.stats["Environment/Cumulative Reward"]) > 0:
-            return np.mean(self.stats["Environment/Cumulative Reward"])
-        else:
-            return 0
-
-    def increment_step_and_update_last_reward(self):
-        """
-        Increment the step count of the trainer and Updates the last reward
+        Increment the step count of the trainer
         """
         self.policy.increment_step()
         return
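The removed `get_last_reward` property only averaged the cumulative rewards collected since the last summary. Any caller that still needs that number can compute it directly from the trainer's stats dictionary; a hypothetical stand-alone helper (not part of this change) would look like:

import numpy as np

def mean_cumulative_reward(stats):
    # Same value the deleted property returned: the mean of the recent
    # "Environment/Cumulative Reward" entries, or 0 if none were recorded yet.
    rewards = stats.get("Environment/Cumulative Reward", [])
    return float(np.mean(rewards)) if len(rewards) > 0 else 0.0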

ml-agents/mlagents/trainers/ppo/models.py (13 changed lines)

         )
         if num_layers < 1:
             num_layers = 1
-        self.last_reward, self.new_reward, self.update_reward = (
-            self.create_reward_encoder()
-        )
         if brain.vector_action_space_type == "continuous":
             self.create_cc_actor_critic(h_size, num_layers)
             self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy

             lr,
             max_step,
         )

-    @staticmethod
-    def create_reward_encoder():
-        """Creates TF ops to track and increment recent average cumulative reward."""
-        last_reward = tf.Variable(
-            0, name="last_reward", trainable=False, dtype=tf.float32
-        )
-        new_reward = tf.placeholder(shape=[], dtype=tf.float32, name="new_reward")
-        update_reward = tf.assign(last_reward, new_reward)
-        return last_reward, new_reward, update_reward
-
     def create_losses(
         self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
ml-agents/mlagents/trainers/ppo/policy.py (16 changed lines)

             value=mean_values,
             outputs=run_out,
         )
-
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        return self.sess.run(self.model.last_reward)
-
-    def update_reward(self, new_reward):
-        """
-        Updates reward value for policy.
-        :param new_reward: New reward to save.
-        """
-        self.sess.run(
-            self.model.update_reward, feed_dict={self.model.new_reward: new_reward}
-        )

ml-agents/mlagents/trainers/ppo/trainer.py (7 changed lines)

         """
         return self._reward_buffer

-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and Updates the last reward
+        Increment the step count of the trainer
         """
-        if self.stats["Environment/Cumulative Reward"]:
-            mean_reward = np.mean(self.stats["Environment/Cumulative Reward"])
-            self.policy.update_reward(mean_reward)
         self.policy.increment_step()
         self.step = self.policy.get_current_step()

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (2 changed lines)

     )
     trainer_mock.update_policy.assert_called_once()
     trainer_mock.write_summary.assert_called_once()
-    trainer_mock.increment_step_and_update_last_reward.assert_called_once()
+    trainer_mock.increment_step.assert_called_once()
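The updated assertion captures the new contract: after a training step the controller should call the plain `increment_step()`. A hypothetical, stripped-down illustration of that expectation with `unittest.mock` (not the project's actual fixture) is:

from unittest.mock import MagicMock

def test_controller_increments_trainer_step():
    trainer_mock = MagicMock()
    trainer_mock.get_step = 0
    trainer_mock.get_max_steps = 5

    # Stand-in for the controller's end-of-step bookkeeping.
    if trainer_mock.get_step <= trainer_mock.get_max_steps:
        trainer_mock.increment_step()

    trainer_mock.increment_step.assert_called_once()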

ml-agents/mlagents/trainers/trainer.py (16 changed lines)

         """
         raise UnityTrainerException("The get_step property was not implemented.")

-    @property
-    def get_last_reward(self):
-        """
-        Returns the last reward the trainer has had
-        :return: the new last reward
-        """
-        raise UnityTrainerException("The get_last_reward property was not implemented.")
-
-    def increment_step_and_update_last_reward(self):
+    def increment_step(self):
         """
-        Increment the step count of the trainer and updates the last reward
+        Increment the step count of the trainer
         """
-        raise UnityTrainerException(
-            "The increment_step_and_update_last_reward method was not implemented."
-        )
+        raise UnityTrainerException("The increment_step method was not implemented.")

     def get_action(self, curr_info: BrainInfo) -> ActionInfo:
         """

ml-agents/mlagents/trainers/trainer_controller.py (2 changed lines)

            else:
                trainer.write_summary(self.global_step, delta_train_start)
            if self.train_model and trainer.get_step <= trainer.get_max_steps:
-                trainer.increment_step_and_update_last_reward()
+                trainer.increment_step()
        return new_info