
Fix get_value_estimate and buffer append (#2276)

Fixes shuffling issue with newer versions of numpy (#1798). 
* make get_value_estimates output a dict of floats
* Use np.append instead of convert to list, unconvert
* Add type hints and test for get_value_estimates
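
The second bullet is the buffer-append fix: instead of round-tripping the value-estimate array through a Python list to attach the bootstrap value, the trainer now calls np.append, which returns a new array directly. A minimal sketch of the equivalence (the numbers are illustrative, not from the diff):

import numpy as np

# Value estimates for time-steps t..T plus a bootstrap value for T+1.
value_estimates = np.array([0.5, 0.7, 0.9], dtype=np.float32)
value_next = 1.1

# Old approach: convert to a list, append, convert back to an array.
old_style = np.asarray(value_estimates.tolist() + [value_next])

# New approach: np.append copies everything into a fresh array in one call.
new_style = np.append(value_estimates, value_next)

assert np.allclose(old_style, new_style)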
Branch: develop-generalizationTraining-TrainerController
GitHub · 5 years ago
Commit a5b7cf95

3 files changed, 53 insertions(+), 7 deletions(-)
  1. ml-agents/mlagents/trainers/ppo/policy.py (18 changes)
  2. ml-agents/mlagents/trainers/ppo/trainer.py (11 changes)
  3. ml-agents/mlagents/trainers/tests/test_ppo.py (31 changes)

ml-agents/mlagents/trainers/ppo/policy.py (18 changes)


 import logging
 import numpy as np
 from typing import Any, Dict
 import tensorflow as tf
 from mlagents.envs.timers import timed
 from mlagents.trainers import BrainInfo, ActionInfo

         run_out = self._execute_model(feed_dict, self.update_dict)
         return run_out

-    def get_value_estimates(self, brain_info, idx):
+    def get_value_estimates(
+        self, brain_info: BrainInfo, idx: int, done: bool
+    ) -> Dict[str, float]:
+        :param done: Whether or not this is the last element of the episode, in which case we want the value estimate to be 0.
-        feed_dict = {self.model.batch_size: 1, self.model.sequence_length: 1}
+        if done:
+            return {k: 0.0 for k in self.model.value_heads.keys()}
+        feed_dict: Dict[tf.Tensor, Any] = {
+            self.model.batch_size: 1,
+            self.model.sequence_length: 1,
+        }
         for i in range(len(brain_info.visual_observations)):
             feed_dict[self.model.visual_in[i]] = [
                 brain_info.visual_observations[i][idx]
             ]

                 idx
             ].reshape([-1, len(self.model.act_size)])
         value_estimates = self.sess.run(self.model.value_heads, feed_dict)
-        return value_estimates
+        return {k: float(v) for k, v in value_estimates.items()}

     def get_action(self, brain_info: BrainInfo) -> ActionInfo:
         """

ml-agents/mlagents/trainers/ppo/trainer.py (11 changes)


                 else:
                     bootstrapping_info = info
                     idx = l
-                value_next = self.policy.get_value_estimates(bootstrapping_info, idx)
-                if info.local_done[l] and not info.max_reached[l]:
-                    value_next["extrinsic"] = 0.0
+                value_next = self.policy.get_value_estimates(
+                    bootstrapping_info,
+                    idx,
+                    info.local_done[l] and not info.max_reached[l],
+                )

                 tmp_advantages = []
                 tmp_returns = []
                 for name in self.policy.reward_signals:

     :param lambd: GAE weighing factor.
     :return: list of advantage estimates for time-steps t to T.
     """
-    value_estimates = np.asarray(value_estimates.tolist() + [value_next])
+    value_estimates = np.append(value_estimates, value_next)
     delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
     advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
     return advantage
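
For reference, the np.append line feeds directly into the GAE computation shown above. Below is a self-contained sketch of that calculation; the discounted_cumsum helper stands in for the trainer's discount_rewards, and the reward/value numbers are made up:

import numpy as np


def discounted_cumsum(deltas: np.ndarray, discount: float) -> np.ndarray:
    """Backward-accumulate deltas, weighting later terms by `discount`."""
    out = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * running
        out[t] = running
    return out


def gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    # Append the bootstrap value so the array covers time-steps t..T+1.
    value_estimates = np.append(value_estimates, value_next)
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
    # GAE is the discounted sum of residuals with factor gamma * lambd.
    return discounted_cumsum(delta_t, gamma * lambd)


rewards = np.array([1.0, 0.0, 1.0])
values = np.array([0.5, 0.4, 0.6])
print(gae(rewards, values, value_next=0.0))  # three advantage values, one per step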

ml-agents/mlagents/trainers/tests/test_ppo.py (31 changes)


+@mock.patch("mlagents.envs.UnityEnvironment.executable_launcher")
+@mock.patch("mlagents.envs.UnityEnvironment.get_communicator")
+def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
+    tf.reset_default_graph()
+    mock_communicator.return_value = MockCommunicator(
+        discrete_action=False, visual_inputs=0
+    )
+    env = UnityEnvironment(" ")
+    brain_infos = env.reset()
+    brain_info = brain_infos[env.brain_names[0]]
+
+    trainer_parameters = dummy_config
+    model_path = env.brain_names[0]
+    trainer_parameters["model_path"] = model_path
+    trainer_parameters["keep_checkpoints"] = 3
+    policy = PPOPolicy(
+        0, env.brains[env.brain_names[0]], trainer_parameters, False, False
+    )
+    run_out = policy.get_value_estimates(brain_info, 0, done=False)
+    for key, val in run_out.items():
+        assert type(key) is str
+        assert type(val) is float
+
+    run_out = policy.get_value_estimates(brain_info, 0, done=True)
+    for key, val in run_out.items():
+        assert type(key) is str
+        assert val == 0.0
+
+    env.close()
+
+
 @mock.patch("mlagents.envs.UnityEnvironment.executable_launcher")
 @mock.patch("mlagents.envs.UnityEnvironment.get_communicator")
 def test_ppo_model_cc_vector(mock_communicator, mock_launcher):
     tf.reset_default_graph()
     with tf.Session() as sess:
