
Simplify creation of optimizer, breaks multi-GPU

/develop/nopreviousactions
Ervin Teng, 5 years ago
Commit e912fa47
4 changed files with 24 additions and 63 deletions
  1. ml-agents/mlagents/trainers/optimizer.py (4 changes)
  2. ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (33 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer.py (31 changes)
  4. ml-agents/mlagents/trainers/ppo/policy.py (19 changes)

ml-agents/mlagents/trainers/optimizer.py (4 changes)


  from mlagents.tf_utils.tf import tf
  from mlagents.trainers.buffer import AgentBuffer
- from mlagents.trainers.ppo.models import PPOModel
+ from mlagents.trainers.tf_policy import TFPolicy
  from mlagents.trainers.models import LearningModel
  from mlagents.trainers.trajectory import SplitObservations
  from mlagents.trainers.components.reward_signals.reward_signal_factory import (

  class TFOptimizer(Optimizer, abc.ABC):  # pylint: disable=W0223
      def __init__(
-         self, sess: tf.Session, policy: PPOModel, reward_signal_configs: Dict[str, Any]
+         self, sess: tf.Session, policy: TFPolicy, reward_signal_configs: Dict[str, Any]
      ):
          super().__init__(policy, reward_signal_configs)
          self.sess = sess
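The effect of this hunk is that the optimizer base class is now typed against the generic TFPolicy rather than the concrete PPOModel. A minimal, self-contained sketch of that pattern, using toy stand-in classes rather than the real mlagents types:

    import abc
    from typing import Any, Dict

    class PolicyStub:  # stand-in for mlagents' TFPolicy
        pass

    class OptimizerStub(abc.ABC):  # stand-in for the Optimizer base class
        def __init__(self, policy: PolicyStub, reward_signal_configs: Dict[str, Any]):
            self.policy = policy
            self.reward_signal_configs = reward_signal_configs

    class TFOptimizerStub(OptimizerStub):
        # Mirrors the changed signature: any policy object is accepted, and the
        # TF session handle is stored alongside it.
        def __init__(self, sess: Any, policy: PolicyStub, reward_signal_configs: Dict[str, Any]):
            super().__init__(policy, reward_signal_configs)
            self.sess = sess

Any PPO-specific graph construction then belongs in a subclass such as PPOOptimizer (see the ppo/optimizer.py hunk below).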

ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (33 changes)


  from tensorflow.python.client import device_lib
  from mlagents.trainers.brain import BrainParameters
  from mlagents_envs.timers import timed
  from mlagents.trainers.models import EncoderType, LearningRateSchedule
  from mlagents.trainers.ppo.models import PPOModel
  from mlagents.trainers.components.reward_signals import RewardSignal
  from mlagents.trainers.components.reward_signals.reward_signal_factory import (
      create_reward_signal,

          is_training: bool,
          load: bool,
      ):
-         self.towers: List[PPOModel] = []
+         self.towers: List[PPOPolicy] = []
-         self.model: Optional[PPOModel] = None
+         self.model: Optional[PPOPolicy] = None
          self.total_policy_loss: Optional[tf.Tensor] = None
          self.reward_signal_towers: List[Dict[str, RewardSignal]] = []
          self.reward_signals: Dict[str, RewardSignal] = {}

          for device in self.devices:
              with tf.device(device):
                  self.towers.append(
-                     PPOModel(
+                     PPOPolicy(
+                         seed=seed,
-                         lr=float(trainer_params["learning_rate"]),
-                         lr_schedule=LearningRateSchedule(
-                             trainer_params.get(
-                                 "learning_rate_schedule", "linear"
-                             )
-                         ),
-                         h_size=int(trainer_params["hidden_units"]),
-                         epsilon=float(trainer_params["epsilon"]),
-                         beta=float(trainer_params["beta"]),
-                         max_step=float(trainer_params["max_steps"]),
-                         normalize=trainer_params["normalize"],
-                         use_recurrent=trainer_params["use_recurrent"],
-                         num_layers=int(trainer_params["num_layers"]),
-                         m_size=self.m_size,
-                         seed=seed,
-                         stream_names=list(reward_signal_configs.keys()),
-                         vis_encode_type=EncoderType(
-                             trainer_params.get("vis_encode_type", "simple")
-                         ),
+                         trainer_params=trainer_params,
+                         is_training=is_training,
+                         load=load,
                      )
                  )
                  self.towers[-1].create_ppo_optimizer()

          for batch, tower, reward_tower in zip(
              device_batches, self.towers, self.reward_signal_towers
          ):
-             feed_dict.update(self.construct_feed_dict(tower, batch, num_sequences))
+             # feed_dict.update(self.construct_feed_dict(tower, batch, num_sequences)) TODO: Fix multi-GPU optimizer
              stats_needed.update(self.stats_name_to_update_name)
              for _, reward_signal in reward_tower.items():
                  feed_dict.update(
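The towers switch here from PPOModel to PPOPolicy while the multi-GPU feed-dict construction is commented out with a TODO, which is the "breaks multi-GPU" part of the commit message. For orientation, a rough, self-contained sketch of the per-device tower pattern this file relies on; get_local_gpu_devices and make_tower below are hypothetical stand-ins, not mlagents functions:

    from typing import Callable, List

    import tensorflow as tf
    from tensorflow.python.client import device_lib

    def get_local_gpu_devices() -> List[str]:
        # Device names look like "/device:GPU:0"; fall back to CPU if no GPU exists.
        devices = device_lib.list_local_devices()
        gpus = [d.name for d in devices if d.device_type == "GPU"]
        return gpus or ["/device:CPU:0"]

    def build_towers(make_tower: Callable[[], object]) -> List[object]:
        # One graph copy ("tower") is created under each device scope, so the ops
        # and variables it builds are pinned to that device.
        towers = []
        for device in get_local_gpu_devices():
            with tf.device(device):
                towers.append(make_tower())
        return towers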

ml-agents/mlagents/trainers/ppo/optimizer.py (31 changes)


  class PPOOptimizer(TFOptimizer):
-     def __init__(
-         self,
-         brain,
-         sess,
-         policy,
-         reward_signal_configs,
-         lr=1e-4,
-         lr_schedule=LearningRateSchedule.LINEAR,
-         h_size=128,
-         epsilon=0.2,
-         beta=1e-3,
-         max_step=5e6,
-         normalize=False,
-         use_recurrent=False,
-         num_layers=2,
-         m_size=None,
-         seed=0,
-         vis_encode_type=EncoderType.SIMPLE,
-     ):
+     def __init__(self, brain, sess, policy, reward_signal_configs, trainer_params):
          """
          Takes a Unity environment and model-specific hyper-parameters and returns the
          appropriate PPO agent model for the environment.

          :return: a sub-class of PPOAgent tailored to the environment.
          """
          super().__init__(sess, policy, reward_signal_configs)
+         lr = float(trainer_params["learning_rate"])
+         lr_schedule = LearningRateSchedule(
+             trainer_params.get("learning_rate_schedule", "linear")
+         )
+         h_size = int(trainer_params["hidden_units"])
+         epsilon = float(trainer_params["epsilon"])
+         beta = float(trainer_params["beta"])
+         max_step = float(trainer_params["max_steps"])
+         num_layers = int(trainer_params["num_layers"])
+         vis_encode_type = EncoderType(trainer_params.get("vis_encode_type", "simple"))
          self.stream_names = self.reward_signals.keys()
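The new constructor pulls its hyperparameters out of trainer_params instead of taking them as individual keyword arguments. A hypothetical trainer_params dict covering just the keys read in this hunk might look like the following; the values are illustrative examples, not values taken from the commit:

    # Illustrative only: keys match those read by the new __init__ above,
    # values are example settings rather than repository defaults.
    trainer_params = {
        "learning_rate": 3.0e-4,
        "learning_rate_schedule": "linear",  # optional; .get() falls back to "linear"
        "hidden_units": 128,
        "epsilon": 0.2,
        "beta": 5.0e-3,
        "max_steps": 5.0e5,
        "num_layers": 2,
        "vis_encode_type": "simple",         # optional; .get() falls back to "simple"
    }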

ml-agents/mlagents/trainers/ppo/policy.py (19 changes)


  from mlagents_envs.timers import timed
  from mlagents_envs.base_env import BatchedStepResult
  from mlagents.trainers.brain import BrainParameters
- from mlagents.trainers.models import EncoderType, LearningRateSchedule
+ from mlagents.trainers.models import EncoderType
  from mlagents.trainers.models import LearningModel
  from mlagents.trainers.ppo.optimizer import PPOOptimizer
  from mlagents.trainers.tf_policy import TFPolicy

              policy=self,
              sess=self.sess,
              reward_signal_configs=reward_signal_configs,
-             lr=float(trainer_params["learning_rate"]),
-             lr_schedule=LearningRateSchedule(
-                 trainer_params.get("learning_rate_schedule", "linear")
-             ),
-             h_size=int(trainer_params["hidden_units"]),
-             epsilon=float(trainer_params["epsilon"]),
-             beta=float(trainer_params["beta"]),
-             max_step=float(trainer_params["max_steps"]),
-             normalize=False,
-             use_recurrent=trainer_params["use_recurrent"],
-             num_layers=int(trainer_params["num_layers"]),
-             m_size=self.m_size,
-             seed=seed,
-             vis_encode_type=EncoderType(
-                 trainer_params.get("vis_encode_type", "simple")
-             ),
+             trainer_params=trainer_params,
          )
          self.optimizer.create_ppo_optimizer()
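Together with the optimizer change, the call site in PPOPolicy shrinks from unpacking every hyperparameter to forwarding the whole trainer_params dict and letting the optimizer unpack it. A small self-contained illustration of that pattern with toy classes, not the mlagents API:

    from typing import Any, Dict

    class OldStyleOptimizer:
        # Before: every hyperparameter is a separate constructor argument,
        # so each call site has to unpack the config dict itself.
        def __init__(self, lr: float, h_size: int, epsilon: float, beta: float):
            self.lr, self.h_size, self.epsilon, self.beta = lr, h_size, epsilon, beta

    class NewStyleOptimizer:
        # After: the constructor receives the config dict and unpacks it internally.
        def __init__(self, trainer_params: Dict[str, Any]):
            self.lr = float(trainer_params["learning_rate"])
            self.h_size = int(trainer_params["hidden_units"])
            self.epsilon = float(trainer_params["epsilon"])
            self.beta = float(trainer_params["beta"])

    params = {"learning_rate": 3.0e-4, "hidden_units": 128, "epsilon": 0.2, "beta": 5.0e-3}
    old = OldStyleOptimizer(
        lr=float(params["learning_rate"]),
        h_size=int(params["hidden_units"]),
        epsilon=float(params["epsilon"]),
        beta=float(params["beta"]),
    )
    new = NewStyleOptimizer(params)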
