
cleanup around AdamOptimizer (#4333)

* cleanup around AdamOptimizer

* methods to create Optimizer instances
GitHub committed 4 years ago

Current commit: 3f44a0bc

4 files changed, 14 insertions and 11 deletions
  1. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (3 changes)
  2. ml-agents/mlagents/trainers/ppo/optimizer.py (8 changes)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (7 changes)
  4. ml-agents/mlagents/trainers/sac/trainer.py (7 changes)

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (3 changes)

                 self.reward_signals[reward_signal.value].update_dict
             )

+    @classmethod
     def create_optimizer_op(
-        self, learning_rate: tf.Tensor, name: str = "Adam"
+        cls, learning_rate: tf.Tensor, name: str = "Adam"
     ) -> tf.train.Optimizer:
         return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
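
Because create_optimizer_op is now a classmethod, it acts as an overridable factory: a subclass can hand back a different tf.train.Optimizer without touching any of the gradient/update plumbing built on top of it. A minimal sketch of that idea using plain TF1-compat TensorFlow; the RMSProp subclass is hypothetical and not part of this commit:

import tensorflow.compat.v1 as tf


class BaseTFOptimizer:
    # Mirrors the default above: build an Adam optimizer from a learning-rate tensor.
    @classmethod
    def create_optimizer_op(
        cls, learning_rate: tf.Tensor, name: str = "Adam"
    ) -> tf.train.Optimizer:
        return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)


class RMSPropTFOptimizer(BaseTFOptimizer):
    # Hypothetical override: swap in RMSProp while callers keep using the same hook.
    @classmethod
    def create_optimizer_op(
        cls, learning_rate: tf.Tensor, name: str = "RMSProp"
    ) -> tf.train.Optimizer:
        return tf.train.RMSPropOptimizer(learning_rate=learning_rate, name=name)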

ml-agents/mlagents/trainers/ppo/optimizer.py (8 changes)

         self.stream_names = list(self.reward_signals.keys())
-        self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
+        self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
         self.grads = None
         self.update_batch: Optional[tf.Operation] = None

         )

     def _create_ppo_optimizer_ops(self):
-        self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
-        self.grads = self.tf_optimizer.compute_gradients(self.loss)
-        self.update_batch = self.tf_optimizer.minimize(self.loss)
+        self.tf_optimizer_op = self.create_optimizer_op(self.learning_rate)
+        self.grads = self.tf_optimizer_op.compute_gradients(self.loss)
+        self.update_batch = self.tf_optimizer_op.minimize(self.loss)

     @timed
     def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
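
For reference, both handles stored by _create_ppo_optimizer_ops come from the same optimizer instance: compute_gradients yields (gradient, variable) pairs that can be inspected or logged, while minimize builds the op that actually applies an update. A generic TF1 sketch of that pattern (standalone, not ml-agents code):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

x = tf.Variable(3.0)
loss = tf.square(x)

tf_optimizer_op = tf.train.AdamOptimizer(learning_rate=0.1, name="Adam")
grads = tf_optimizer_op.compute_gradients(loss)   # list of (gradient, variable) pairs
update_batch = tf_optimizer_op.minimize(loss)     # op that applies one Adam step

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([g for g, _ in grads]))  # d(x^2)/dx at x=3.0 -> [6.0]
    sess.run(update_batch)                  # take one optimization step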

ml-agents/mlagents/trainers/ppo/trainer.py (7 changes)

         return policy

+    def create_ppo_optimizer(self) -> PPOOptimizer:
+        return PPOOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
+
     def add_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:

         )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = PPOOptimizer(
-            cast(TFPolicy, self.policy), self.trainer_settings
-        )
+        self.optimizer = self.create_ppo_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
         # Needed to resume loads properly
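
The new create_ppo_optimizer hook (and create_sac_optimizer in the SAC trainer below) means add_policy no longer hard-codes the optimizer class, so a trainer subclass or test double can substitute its own optimizer by overriding a single method. A hypothetical example, assuming the import paths of this ml-agents version; the custom classes are made up for illustration:

from typing import cast

from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.ppo.trainer import PPOTrainer


class MyPPOOptimizer(PPOOptimizer):
    """Hypothetical optimizer variant, used only for illustration."""


class MyPPOTrainer(PPOTrainer):
    def create_ppo_optimizer(self) -> PPOOptimizer:
        # Only the factory changes; add_policy and the rest of the trainer
        # pick up the custom optimizer through this hook.
        return MyPPOOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)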

ml-agents/mlagents/trainers/sac/trainer.py (7 changes)

         for stat, stat_list in batch_update_stats.items():
             self._stats_reporter.add_stat(stat, np.mean(stat_list))

+    def create_sac_optimizer(self) -> SACOptimizer:
+        return SACOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
+
     def add_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:

         )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = SACOptimizer(
-            cast(TFPolicy, self.policy), self.trainer_settings
-        )
+        self.optimizer = self.create_sac_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
         # Needed to resume loads properly
