
Merge branch 'master' into global-variables

/MLA-1734-demo-provider
Anupam Bhatnagar, 4 years ago
Current commit
a5cc4d03
5 changed files with 28 additions and 48 deletions
  1. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (3 lines changed)
  2. ml-agents/mlagents/trainers/ppo/optimizer.py (8 lines changed)
  3. ml-agents/mlagents/trainers/ppo/trainer.py (7 lines changed)
  4. ml-agents/mlagents/trainers/sac/trainer.py (7 lines changed)
  5. ml-agents/mlagents/trainers/stats.py (51 lines changed)

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (3 lines changed)


         self.reward_signals[reward_signal.value].update_dict
     )

+    @classmethod
     def create_optimizer_op(
-        self, learning_rate: tf.Tensor, name: str = "Adam"
+        cls, learning_rate: tf.Tensor, name: str = "Adam"
     ) -> tf.train.Optimizer:
         return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
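Note: the change above turns the optimizer factory into a classmethod; it reads no instance state, so it can be reached without a constructed optimizer object. A minimal sketch of the pattern follows. The class name OptimizerBase is illustrative, and tensorflow.compat.v1 merely stands in for however the project imports its TF 1.x API; this is not the real TFOptimizer class.

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()  # the diff uses the TF 1.x graph-mode API

    class OptimizerBase:
        # Illustrative stand-in for the class edited above, not the real TFOptimizer.
        @classmethod
        def create_optimizer_op(
            cls, learning_rate: tf.Tensor, name: str = "Adam"
        ) -> tf.train.Optimizer:
            # Reads no instance state, hence the switch from `self` to `cls`.
            return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)

    adam = OptimizerBase.create_optimizer_op(tf.constant(3e-4))  # no instance needed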

ml-agents/mlagents/trainers/ppo/optimizer.py (8 lines changed)


         self.stream_names = list(self.reward_signals.keys())
-        self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
+        self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
         self.grads = None
         self.update_batch: Optional[tf.Operation] = None

         )
     def _create_ppo_optimizer_ops(self):
-        self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
-        self.grads = self.tf_optimizer.compute_gradients(self.loss)
-        self.update_batch = self.tf_optimizer.minimize(self.loss)
+        self.tf_optimizer_op = self.create_optimizer_op(self.learning_rate)
+        self.grads = self.tf_optimizer_op.compute_gradients(self.loss)
+        self.update_batch = self.tf_optimizer_op.minimize(self.loss)

     @timed
     def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
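Note: this hunk renames the attribute from tf_optimizer to tf_optimizer_op and widens its annotation from Optional[tf.train.AdamOptimizer] to the base Optional[tf.train.Optimizer]; _create_ppo_optimizer_ops still keeps the compute_gradients output alongside the minimize op. A small self-contained sketch of that compute_gradients/minimize pairing, assuming the same TF 1.x API; the toy loss is made up and only the optimizer wiring mirrors the diff.

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    # Toy loss standing in for the PPO loss.
    w = tf.Variable(1.0, name="w")
    loss = tf.square(w - 3.0)

    tf_optimizer_op = tf.train.AdamOptimizer(learning_rate=1e-2)
    grads = tf_optimizer_op.compute_gradients(loss)   # list of (gradient, variable) pairs, kept for logging
    update_batch = tf_optimizer_op.minimize(loss)     # op that applies one gradient step

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(update_batch)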

ml-agents/mlagents/trainers/ppo/trainer.py (7 lines changed)


         return policy

+    def create_ppo_optimizer(self) -> PPOOptimizer:
+        return PPOOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
+
     def add_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:

         )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = PPOOptimizer(
-            cast(TFPolicy, self.policy), self.trainer_settings
-        )
+        self.optimizer = self.create_ppo_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
         # Needed to resume loads properly
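Note: add_policy no longer constructs PPOOptimizer inline; construction moves behind the new create_ppo_optimizer() factory method, so the choice of optimizer class lives in one overridable spot. A minimal sketch of that factory-method shape, with hypothetical class names (TrainerSketch, FakeOptimizer) rather than the real trainer and optimizer classes:

    class FakeOptimizer:
        def __init__(self, policy, settings):
            self.policy, self.settings = policy, settings
            self.reward_signals = {"extrinsic": None}

    class TrainerSketch:
        def __init__(self, settings):
            self.trainer_settings = settings
            self.policy = None
            self.optimizer = None

        def create_optimizer(self):
            # The one place that decides which optimizer class gets constructed.
            return FakeOptimizer(self.policy, self.trainer_settings)

        def add_policy(self, policy):
            self.policy = policy
            self.optimizer = self.create_optimizer()  # was an inline FakeOptimizer(...) call

    class CustomTrainer(TrainerSketch):
        def create_optimizer(self):
            # Subclasses (or test doubles) override only the factory.
            return FakeOptimizer(self.policy, {"custom": True})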

ml-agents/mlagents/trainers/sac/trainer.py (7 lines changed)


         for stat, stat_list in batch_update_stats.items():
             self._stats_reporter.add_stat(stat, np.mean(stat_list))

+    def create_sac_optimizer(self) -> SACOptimizer:
+        return SACOptimizer(cast(TFPolicy, self.policy), self.trainer_settings)
+
     def add_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:

         )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = SACOptimizer(
-            cast(TFPolicy, self.policy), self.trainer_settings
-        )
+        self.optimizer = self.create_sac_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
         # Needed to resume loads properly
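Note: the SAC trainer gets the mirror-image change, a create_sac_optimizer() factory that add_policy now calls. One plausible payoff is testability, since a test can swap the factory without building anything TF-backed; a hedged pytest-style sketch with made-up class names (not the real ml-agents classes):

    class StubOptimizer:
        reward_signals = {"extrinsic": None}

    class SACTrainerSketch:
        def create_sac_optimizer(self):
            raise RuntimeError("would construct the real, TF-backed SACOptimizer")

        def add_policy(self, policy):
            self.policy = policy
            self.optimizer = self.create_sac_optimizer()

    def test_add_policy_uses_factory(monkeypatch):
        # Replace only the factory; add_policy itself stays untouched.
        monkeypatch.setattr(
            SACTrainerSketch, "create_sac_optimizer", lambda self: StubOptimizer()
        )
        trainer = SACTrainerSketch()
        trainer.add_policy(policy=object())
        assert isinstance(trainer.optimizer, StubOptimizer)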

ml-agents/mlagents/trainers/stats.py (51 lines changed)


             if stats_summary.mean > 0.0:
                 is_training = "Training."

+        elapsed_time = time.time() - self.training_start_time
+        log_info: List[str] = [category]
+        log_info.append(f"Step: {step}")
+        log_info.append(f"Time Elapsed: {elapsed_time:0.3f} s")
-            logger.info(
-                "Rank: {}."
-                "{}: Step: {}. "
-                "Time Elapsed: {:0.3f} s "
-                "Mean "
-                "Reward: {:0.3f}"
-                ". Std of Reward: {:0.3f}. {}".format(
-                    self.rank(),
-                    category,
-                    step,
-                    time.time() - self.training_start_time,
-                    stats_summary.mean,
-                    stats_summary.std,
-                    is_training,
-                )
-            )
-        else:
-            logger.info(
-                "{}: Step: {}. "
-                "Time Elapsed: {:0.3f} s "
-                "Mean "
-                "Reward: {:0.3f}"
-                ". Std of Reward: {:0.3f}. {}".format(
-                    category,
-                    step,
-                    time.time() - self.training_start_time,
-                    stats_summary.mean,
-                    stats_summary.std,
-                    is_training,
-                )
-            )
+            log_info.append(f"Rank: {self.rank}")
+            log_info.append(f"Mean Reward: {stats_summary.mean:0.3f}")
+            log_info.append(f"Std of Reward: {stats_summary.std:0.3f}")
+            log_info.append(is_training)
-            logger.info(f"{category} ELO: {elo_stats.mean:0.3f}. ")
+            log_info.append(f"ELO: {elo_stats.mean:0.3f}")
-            logger.info(
-                "{}: Step: {}. No episode was completed since last summary. {}".format(
-                    category, step, is_training
-                )
-            )
+            log_info.append("No episode was completed since last summary")
+            log_info.append(is_training)
+        logger.info(". ".join(log_info))

     def add_property(
         self, category: str, property_type: StatsPropertyType, value: Any
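Note: the console writer previously chose between several logger.info calls built from positional .format() templates (one including the rank, one without, plus separate messages for ELO and the no-episode case); it now accumulates f-string fragments in a log_info list and emits one ". "-joined line, so optional fields become simple conditional appends. A self-contained sketch of that accumulate-then-join style; the function, field names, and values are made up for illustration.

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("stats_sketch")

    def write_summary(category: str, step: int, mean: float, std: float, rank=None) -> None:
        # Collect fragments first; optional pieces are plain conditional appends.
        log_info = [category, f"Step: {step}"]
        if rank is not None:
            log_info.append(f"Rank: {rank}")
        log_info.append(f"Mean Reward: {mean:0.3f}")
        log_info.append(f"Std of Reward: {std:0.3f}")
        logger.info(". ".join(log_info))

    write_summary("3DBall", step=5000, mean=12.345, std=1.234)          # no Rank fragment
    write_summary("3DBall", step=5000, mean=12.345, std=1.234, rank=0)  # with Rank fragment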
