
Move policy to common location, remove epsilon

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit
151e3b1c
12 changed files with 33 additions and 48 deletions
  1. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (8 changes)
  2. ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (3 changes)
  3. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (6 changes)
  4. ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (12 changes)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (14 changes)
  6. ml-agents/mlagents/trainers/tests/test_bcmodule.py (4 changes)
  7. ml-agents/mlagents/trainers/tests/test_multigpu.py (10 changes)
  8. ml-agents/mlagents/trainers/tests/test_ppo.py (8 changes)
  9. ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changes)
  10. ml-agents/mlagents/trainers/common/nn_policy.py (12 changes)
  11. /ml-agents/mlagents/trainers/common/nn_policy.py (0 changes; renamed from /ml-agents/mlagents/trainers/ppo/policy.py)

ml-agents/mlagents/trainers/components/reward_signals/__init__.py (8 changes)


from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
logger = logging.getLogger("mlagents.trainers")

"""
Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
the reward strength, and the gamma (discount factor).
:param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to.
:param policy: The Policy object (e.g. NNPolicy) that this Reward Signal will apply to.
:param strength: The strength of the reward. The reward's raw value will be multiplied by this value.
:param gamma: The time discounting factor used for this reward.
:return: A RewardSignal object.

)
def prepare_update(
self,
policy_model: LearningModel,
mini_batch: Dict[str, np.ndarray],
num_sequences: int,
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict

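The interface change above replaces the LearningModel argument of prepare_update with the policy itself. A minimal sketch of a subclass written against the new signature; ConstantRewardSignal is a hypothetical name used only for illustration, and a signal with no internal model has nothing to feed:

from typing import Any, Dict

import numpy as np
from mlagents.tf_utils import tf

from mlagents.trainers.components.reward_signals import RewardSignal
from mlagents.trainers.tf_policy import TFPolicy


class ConstantRewardSignal(RewardSignal):
    """Hypothetical reward signal with no internal model."""

    def prepare_update(
        self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        # Nothing to feed here; only signals with an internal model (e.g. GAIL,
        # Curiosity) need to map their placeholders to mini_batch entries.
        return {}
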
ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (3 changes)


from mlagents.tf_utils import tf
from mlagents.trainers.models import LearningModel
from mlagents.trainers.tf_policy import TFPolicy
policy_model: LearningModel,
policy_model: TFPolicy,
encoding_size: int = 128,
learning_rate: float = 3e-4,
):

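The constructor keeps the policy_model parameter name but now takes a TFPolicy instead of a LearningModel. A hedged construction sketch using the default values shown in the diff; the helper name is hypothetical:

from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.tf_policy import TFPolicy


def build_curiosity_model(policy: TFPolicy) -> CuriosityModel:
    # Hypothetical helper: encoding_size and learning_rate match the defaults
    # in the diff; the policy is now passed in place of a LearningModel.
    return CuriosityModel(policy, encoding_size=128, learning_rate=3e-4)
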
ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (6 changes)


from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
class CuriosityRewardSignal(RewardSignal):

super().check_config(config_dict, param_keys)
def prepare_update(
self,
policy: TFPolicy,
mini_batch: Dict[str, np.ndarray],
num_sequences: int,
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
Prepare for update and get feed_dict.

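Within CuriosityRewardSignal, prepare_update follows the same pattern: the signal maps its model's placeholders to mini_batch arrays, given the policy passed in. A standalone sketch of that shape; the attribute and buffer-key names are illustrative stand-ins, not the real CuriosityModel fields:

from typing import Any, Dict

import numpy as np
from mlagents.tf_utils import tf

from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.tf_policy import TFPolicy


def curiosity_feed_dict(
    model: CuriosityModel,
    policy: TFPolicy,
    mini_batch: Dict[str, np.ndarray],
    num_sequences: int,
) -> Dict[tf.Tensor, Any]:
    # Hypothetical helper in the shape of the new prepare_update: placeholders
    # come from the signal's own model, data from the mini_batch.
    return {
        model.selected_actions: mini_batch["actions"],         # hypothetical attribute
        model.next_vector_obs: mini_batch["next_vector_obs"],  # hypothetical attribute
    }
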
ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (12 changes)


from tensorflow.python.client import device_lib
from mlagents.trainers.brain import BrainParameters
from mlagents_envs.timers import timed
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.components.reward_signals import RewardSignal
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,

logger = logging.getLogger("mlagents.trainers")
class MultiGpuPPOPolicy(PPOPolicy):
class MultiGpuNNPolicy(NNPolicy):
def __init__(
self,
seed: int,

load: bool,
):
self.towers: List[PPOPolicy] = []
self.towers: List[NNPolicy] = []
self.model: Optional[PPOPolicy] = None
self.model: Optional[NNPolicy] = None
self.total_policy_loss: Optional[tf.Tensor] = None
self.reward_signal_towers: List[Dict[str, RewardSignal]] = []
self.reward_signals: Dict[str, RewardSignal] = {}

for device in self.devices:
with tf.device(device):
self.towers.append(
PPOPolicy(
NNPolicy(
seed=seed,
brain=brain,
trainer_params=trainer_params,

reward_tower = {}
for reward_signal, config in reward_signal_configs.items():
reward_tower[reward_signal] = create_reward_signal(
self, self.towers[device_id], reward_signal, config
self.towers[device_id], reward_signal, config
)
for k, v in reward_tower[reward_signal].update_dict.items():
self.update_dict[k + "_" + str(device_id)] = v

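In condensed form, the multi-GPU policy now builds one NNPolicy tower per device and one reward-signal dictionary per tower, suffixing each update-dict key with the device index. The function below is an illustrative, standalone restatement of that setup; the enumerate-based device_id and the is_training/load flags are assumptions beyond what the diff shows:

from mlagents.tf_utils import tf
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)


def build_towers_and_signals(
    devices, seed, brain, trainer_params, is_training, load, reward_signal_configs
):
    # Hypothetical standalone rendering of MultiGpuNNPolicy's per-device setup.
    towers = []
    reward_signal_towers = []
    update_dict = {}
    for device_id, device in enumerate(devices):
        with tf.device(device):
            tower = NNPolicy(seed, brain, trainer_params, is_training, load)
        towers.append(tower)
        reward_tower = {}
        for reward_signal, config in reward_signal_configs.items():
            # Signals are now created from the tower policy alone, without the
            # extra multi-GPU wrapper argument that was passed before.
            reward_tower[reward_signal] = create_reward_signal(tower, reward_signal, config)
            for k, v in reward_tower[reward_signal].update_dict.items():
                update_dict[k + "_" + str(device_id)] = v
        reward_signal_towers.append(reward_tower)
    return towers, reward_signal_towers, update_dict
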
ml-agents/mlagents/trainers/ppo/trainer.py (14 changes)


import numpy as np
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuNNPolicy, get_devices
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.tf_policy import TFPolicy

self.load = load
self.multi_gpu = multi_gpu
self.seed = seed
self.policy: PPOPolicy = None # type: ignore
self.policy: NNPolicy = None # type: ignore
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""

"""
if self.multi_gpu and len(get_devices()) > 1:
policy: PPOPolicy = MultiGpuPPOPolicy(
policy: NNPolicy = MultiGpuNNPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,

else:
policy = PPOPolicy(
policy = NNPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,

self.__class__.__name__
)
)
if not isinstance(policy, PPOPolicy):
raise RuntimeError("Non-PPOPolicy passed to PPOTrainer.add_policy()")
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
self.policy.initialize_or_load()

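The trainer now selects between MultiGpuNNPolicy and NNPolicy based on the multi_gpu flag and the number of visible devices, and add_policy rejects anything that is not an NNPolicy. A sketch of that selection as a standalone helper; the function name and any constructor arguments beyond those visible in the diff are assumptions:

from typing import Any, Dict

from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuNNPolicy, get_devices


def make_ppo_policy(
    seed: int,
    brain_parameters: BrainParameters,
    trainer_parameters: Dict[str, Any],
    is_training: bool,
    load: bool,
    multi_gpu: bool,
) -> NNPolicy:
    # Hypothetical helper mirroring the trainer's selection logic after the
    # rename: fall back to a single NNPolicy unless multi_gpu was requested
    # and more than one device is visible.
    if multi_gpu and len(get_devices()) > 1:
        return MultiGpuNNPolicy(
            seed, brain_parameters, trainer_parameters, is_training, load
        )
    return NNPolicy(seed, brain_parameters, trainer_parameters, is_training, load)

Because MultiGpuNNPolicy subclasses NNPolicy, the isinstance(policy, NNPolicy) check in add_policy still accepts the multi-GPU variant.
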
ml-agents/mlagents/trainers/tests/test_bcmodule.py (4 changes)


import yaml
import os
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.sac.policy import SACPolicy

)
policy = (
PPOPolicy(0, mock_brain, trainer_config, False, False)
NNPolicy(0, mock_brain, trainer_config, False, False)
if trainer_config["trainer"] == "ppo"
else SACPolicy(0, mock_brain, trainer_config, False, False)
)

ml-agents/mlagents/trainers/tests/test_multigpu.py (10 changes)


from mlagents.tf_utils import tf
import yaml
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuNNPolicy
from mlagents.trainers.tests.mock_brain import create_mock_brainparams

trainer_parameters["keep_checkpoints"] = 3
brain = create_mock_brainparams()
policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
policy = MultiGpuNNPolicy(0, brain, trainer_parameters, False, False)
assert len(policy.towers) == len(mock_get_devices.return_value)

trainer_parameters["keep_checkpoints"] = 3
brain = create_mock_brainparams()
with tf.Session() as sess:
policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
policy = MultiGpuNNPolicy(0, brain, trainer_parameters, False, False)
var = tf.Variable(0)
tower_grads = [
[(tf.constant(0.1), var)],

@mock.patch("mlagents.trainers.tf_policy.TFPolicy._execute_model")
@mock.patch("mlagents.trainers.ppo.policy.PPOPolicy.construct_feed_dict")
@mock.patch("mlagents.trainers.common.nn_policy.NNPolicy.construct_feed_dict")
@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
def test_update(
mock_get_devices, mock_construct_feed_dict, mock_execute_model, dummy_config

trainer_parameters["model_path"] = ""
trainer_parameters["keep_checkpoints"] = 3
brain = create_mock_brainparams()
policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
policy = MultiGpuNNPolicy(0, brain, trainer_parameters, False, False)
mock_mini_batch = mock.Mock()
mock_mini_batch.items.return_value = [("action", [1, 2]), ("value", [3, 4])]
run_out = policy.update(mock_mini_batch, 1)

ml-agents/mlagents/trainers/tests/test_ppo.py (8 changes)


from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.models import EncoderType, LearningModel
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.brain import BrainParameters, CameraResolution

model_path = brain_name
trainer_parameters["model_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
policy = PPOPolicy(0, brain_params, trainer_parameters, False, False)
policy = NNPolicy(0, brain_params, trainer_parameters, False, False)
run_out = policy.evaluate(batched_step, list(batched_step.agent_id))
assert run_out["action"].shape == (3, 2)
env.close()

)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
policy = PPOPolicy(0, brain_params, dummy_config, False, False)
policy = NNPolicy(0, brain_params, dummy_config, False, False)
time_horizon = 15
trajectory = make_fake_trajectory(
length=time_horizon,

trainer = PPOTrainer(
brain_params.brain_name, 0, trainer_params, True, False, 0, "0", False
)
policy_mock = mock.Mock(spec=PPOPolicy)
policy_mock = mock.Mock(spec=NNPolicy)
step_count = (
5
) # 10 hacked because this function is no longer called through trainer

ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changes)


import yaml
import os
import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.sac.policy import SACPolicy

trainer_parameters["reward_signals"].update(reward_signal_config)
trainer_parameters["use_recurrent"] = use_rnn
if trainer_config["trainer"] == "ppo":
policy = PPOPolicy(0, mock_brain, trainer_parameters, False, False)
policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
else:
policy = SACPolicy(0, mock_brain, trainer_parameters, False, False)
return policy

ml-agents/mlagents/trainers/common/nn_policy.py (12 changes)


logger = logging.getLogger("mlagents.trainers")
class PPOPolicy(TFPolicy):
class NNPolicy(TFPolicy):
def __init__(
self,
seed: int,

self.batch_size_ph: batched_step_result.n_agents(),
self.sequence_length_ph: 1,
}
epsilon = None
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(

if self.use_continuous_act:
epsilon = np.random.normal(
size=(batched_step_result.n_agents(), self.act_size[0])
)
feed_dict[self.epsilon] = epsilon
feed_dict = self.fill_eval_dict(feed_dict, batched_step_result)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out

sigma_sq = tf.exp(self.log_sigma_sq)
self.epsilon = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
)
self.epsilon = tf.random_normal(tf.shape(mu))
# Clip and scale output to ensure actions are always within [-1, 1] range.
self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3

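The substantive change in nn_policy.py is that continuous-action noise is no longer supplied from NumPy through an epsilon placeholder at evaluate() time; it is drawn in-graph with tf.random_normal. A minimal standalone sketch of the before and after, using TF1-style code consistent with the diff; mu, log_sigma_sq, and the sizes are stand-ins for the policy's own tensors:

import numpy as np
from mlagents.tf_utils import tf

act_size, n_agents = 2, 3                # illustrative sizes only
mu = tf.zeros([n_agents, act_size])      # stands in for the policy mean
log_sigma_sq = tf.zeros([act_size])      # stands in for the learned log variance
sigma_sq = tf.exp(log_sigma_sq)

# Before: epsilon was a placeholder, filled per step with np.random.normal(...)
epsilon_ph = tf.placeholder(shape=[None, act_size], dtype=tf.float32, name="epsilon")
output_pre_old = mu + tf.sqrt(sigma_sq) * epsilon_ph

# After: the same reparameterized sample, drawn inside the graph.
epsilon = tf.random_normal(tf.shape(mu))
output_pre_new = mu + tf.sqrt(sigma_sq) * epsilon

with tf.Session() as sess:
    old_val = sess.run(
        output_pre_old,
        feed_dict={epsilon_ph: np.random.normal(size=(n_agents, act_size))},
    )
    new_val = sess.run(output_pre_new)  # no feed needed any more

The practical effect shows up in evaluate() above: the per-step np.random.normal call and the feed_dict[self.epsilon] entry are gone, since the sample now lives inside the graph.
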
/ml-agents/mlagents/trainers/ppo/policy.py → /ml-agents/mlagents/trainers/common/nn_policy.py
