Compare commits

...
This merge request has changes that conflict with the target branch.
/ml-agents/mlagents/trainers/learn.py
/ml-agents/mlagents/trainers/trainer_controller.py
/ml-agents/mlagents/trainers/stats.py
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/sac/trainer.py
/ml-agents/mlagents/trainers/trainer/rl_trainer.py
/config/trainer_config.yaml
/ml-agents/mlagents/tf_utils/tf.py
/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py
/ml-agents/mlagents/trainers/policy/tf_policy.py

27 commits

Author SHA1 Message Commit date
Anupam Bhatnagar 9d7dd3b6 [skip ci] moving step increment to trainer from environment for sac 5 years ago
Anupam Bhatnagar 5055577f Merge branch 'distributed-training' of github.com:Unity-Technologies/ml-agents into distributed-training 5 years ago
Anupam Bhatnagar 2c68e921 [skip ci] fix first summary statement output 5 years ago
Anupam Bhatnagar 4b960034 [skip ci] swap summary writer and step increment order 5 years ago
Anupam Bhatnagar 86e16a64 [skip ci] tweaking 3dball configs 5 years ago
Anupam Bhatnagar 45bac63e [skip ci] more fixes 5 years ago
Anupam Bhatnagar 63abbe71 [skip ci] moving summary writer to update_policy 5 years ago
Anupam Bhatnagar e8d09d00 [skip ci] increment steps on training 5 years ago
Anupam Bhatnagar 95ba923d [skip ci] fix first summary statement output 5 years ago
Anupam Bhatnagar 0d231585 [skip ci] fix typo 5 years ago
Anupam Bhatnagar d49ceecc [skip ci] moving summary writer to update_policy 5 years ago
Anupam Bhatnagar 56d82152 [skip ci] adding horovod rank 5 years ago
Anupam Bhatnagar ac80ec82 [skip ci] increment steps on training 5 years ago
Anupam Bhatnagar 7ae32cc2 [skip ci] replace buffer length by buffer size 5 years ago
Anupam Bhatnagar 76f9b219 [skip ci] adding horovod rank 5 years ago
Anupam Bhatnagar b3c2d431 [skip ci] minor formatting change 5 years ago
Anupam Bhatnagar eb9f3f19 [skip ci] replace buffer length by buffer size 5 years ago
Anupam Bhatnagar 5d180caf [skip ci] modify learning rate in horovod optimizer 5 years ago
Anupam Bhatnagar 06a54ae8 step increment moved to _update_policy, fixed exit status issue 5 years ago
Anupam Bhatnagar c76c2b0d branch rebased 5 years ago
Anupam Bhatnagar c49cc069 removing logging statements 5 years ago
Anupam Bhatnagar 13560d9a [skip ci] adding bouncer jobs with fewer steps 5 years ago
Anupam Bhatnagar d94ae012 [skip ci] shorter 3dball run 5 years ago
Anupam Bhatnagar f36108a9 [skip ci] continue training until worker-0 is done 5 years ago
Anupam Bhatnagar 06c6de13 activate environment from executable 5 years ago
Anupam Bhatnagar 9341f7a2 [skip-ci] small refactors 5 years ago
Anupam Bhatnagar 001fce2a first commit 5 years ago
10 files changed, with 62 insertions and 15 deletions
  1. config/trainer_config.yaml (4 changes)
  2. ml-agents/mlagents/tf_utils/tf.py (3 changes)
  3. ml-agents/mlagents/trainers/trainer_controller.py (7 changes)
  4. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (18 changes)
  5. ml-agents/mlagents/trainers/learn.py (2 changes)
  6. ml-agents/mlagents/trainers/policy/tf_policy.py (5 changes)
  7. ml-agents/mlagents/trainers/stats.py (7 changes)
  8. ml-agents/mlagents/trainers/ppo/trainer.py (11 changes)
  9. ml-agents/mlagents/trainers/sac/trainer.py (14 changes)
  10. ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changes)

config/trainer_config.yaml (4 changes)


3DBall:
    normalize: true
    batch_size: 64
    buffer_size: 12000
    summary_freq: 12000
    max_steps: 1.0e5
3DBallHard:
    normalize: true

ml-agents/mlagents/tf_utils/tf.py (3 changes)


# Everywhere else is caught by the banned-modules setting for flake8
import tensorflow as tf # noqa I201
from distutils.version import LooseVersion
import horovod.tensorflow as hvd
# LooseVersion handles things "1.2.3a" or "4.5.6-rc7" fairly sensibly.

"""
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
# For multi-GPU training, set allow_soft_placement to True to allow
# placing the operation into an alternative device automatically
# to prevent from exceptions if the device doesn't suppport the operation
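For reference, the GPU pinning shown above follows the standard Horovod recipe: each worker process limits TensorFlow to its own local GPU and lets memory grow on demand. A minimal self-contained sketch of that configuration, assuming TF1-style APIs and an installed horovod package; the helper name is illustrative and not part of this repository:

import tensorflow as tf
import horovod.tensorflow as hvd


def horovod_session_config() -> tf.ConfigProto:
    # Illustrative helper: pin this worker to its local GPU and let the
    # allocator grow memory on demand instead of grabbing the whole device.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # local_rank() is this process's index on the current machine, so each
    # worker sees exactly one GPU.
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # Fall back to another device when an op is not supported on the GPU.
    config.allow_soft_placement = True
    return config


if __name__ == "__main__":
    hvd.init()  # must run before any rank/local_rank query
    print(horovod_session_config())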

ml-agents/mlagents/trainers/trainer_controller.py (7 changes)


from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.agent_processor import AgentManager
import horovod.tensorflow as hvd
class TrainerController(object):

"""
Saves current model to checkpoint folder.
"""
if hvd.rank() != 0:
    return
for brain_name in self.trainers.keys():
    for name_behavior_id in self.brain_name_to_identifier[brain_name]:
        self.trainers[brain_name].save_model(name_behavior_id)

"""
Exports latest saved models to .nn format for Unity embedding.
"""
if hvd.rank() != 0:
    return
for brain_name in self.trainers.keys():
    for name_behavior_id in self.brain_name_to_identifier[brain_name]:
        self.trainers[brain_name].export_model(name_behavior_id)
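The same hvd.rank() != 0 early return guards both checkpoint saving and .nn export above, so only the chief worker writes to disk. A small sketch of that guard, written to tolerate a missing horovod install in the same way as the try/except in tf_optimizer.py below; the helper name is made up for illustration:

try:
    import horovod.tensorflow as hvd
except ImportError:
    hvd = None


def is_chief_worker() -> bool:
    # Only rank 0 should save checkpoints or export models; without Horovod
    # the single process is trivially the chief.
    return hvd is None or hvd.rank() == 0

save_model and export_model would then begin with "if not is_chief_worker(): return", which is equivalent to the two guards above whenever Horovod is present.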

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (18 changes)


from mlagents.trainers.components.bc.module import BCModule
try:
    import horovod.tensorflow as hvd
except ImportError:
    hvd = None
class TFOptimizer(Optimizer):  # pylint: disable=W0223
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        self.sess = policy.sess

def create_optimizer_op(
    self, learning_rate: tf.Tensor, name: str = "Adam"
) -> tf.train.Optimizer:
    return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
    if hvd is not None:
        adam_optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate, name=name
        )
        horovod_optimizer = hvd.DistributedOptimizer(adam_optimizer)
    else:
        adam_optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate, name=name
        )
    return horovod_optimizer if hvd is not None else adam_optimizer
def _execute_model(
    self, feed_dict: Dict[tf.Tensor, np.ndarray], out_dict: Dict[str, tf.Tensor]

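The replacement create_optimizer_op wraps the Adam optimizer in hvd.DistributedOptimizer, which averages gradients across workers with allreduce before applying them. The "modify learning rate in horovod optimizer" commit suggests the learning rate is also adjusted for the worker count, but the exact adjustment is not visible in this hunk; the sketch below therefore assumes the common Horovod convention of scaling by hvd.size():

import tensorflow as tf

try:
    import horovod.tensorflow as hvd
except ImportError:
    hvd = None


def create_optimizer_op(learning_rate, name: str = "Adam"):
    # Standalone version of the method above (no class context).
    if hvd is None:
        return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
    # Assumed convention: scale the learning rate by the number of workers,
    # then let DistributedOptimizer allreduce the gradients.
    adam = tf.train.AdamOptimizer(
        learning_rate=learning_rate * hvd.size(), name=name
    )
    return hvd.DistributedOptimizer(adam)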
ml-agents/mlagents/trainers/learn.py (2 changes)


from mlagents_envs.exception import UnityEnvironmentException
from mlagents_envs.timers import hierarchical_timer, get_timer_tree
from mlagents_envs import logging_util
import horovod.tensorflow as hvd
logger = logging_util.get_logger(__name__)

sampler_manager, resampling_interval = create_sampler_manager(
    options.sampler_config, run_seed
)
hvd.init()
trainer_factory = TrainerFactory(
    options.trainer_config,
    summaries_dir,

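The only functional change here is calling hvd.init() before the trainer factory is built, so that rank, size, and local rank are valid for everything constructed afterwards (optimizer wrapping, rank-0 guards, log prefixes). Reduced to its essentials, under the assumption that the surrounding ml-agents setup is elided:

import horovod.tensorflow as hvd


def run_training() -> None:
    # Initialize Horovod before building trainers: every hvd.rank()/size()
    # call elsewhere in this branch depends on it.
    hvd.init()
    print("worker {} of {} starting".format(hvd.rank(), hvd.size()))
    # ... construct TrainerFactory / TrainerController and start training ...


if __name__ == "__main__":
    run_training()

Multi-worker runs would then be launched through Horovod's usual entry points (horovodrun or mpirun).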
ml-agents/mlagents/trainers/policy/tf_policy.py (5 changes)


from mlagents.trainers.brain_conversion_utils import get_global_agent_id
from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.models import ModelUtils
import horovod.tensorflow as hvd
logger = get_logger(__name__)

    self._load_graph(self.model_path, reset_global_steps=reset_steps)
else:
    self._initialize_graph()
    self.sess.run(hvd.broadcast_global_variables(0))
def get_weights(self):
    with self.graph.as_default():

:param steps: The number of steps the model was trained for
:return:
"""
if hvd.rank() != 0:
    return
with self.graph.as_default():
    last_checkpoint = self.model_path + "/model-" + str(steps) + ".ckpt"
    self.saver.save(self.sess, last_checkpoint)
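Two Horovod touches appear in this file: after a fresh graph initialization every worker receives rank 0's variable values via broadcast_global_variables(0), and checkpoints are only written by rank 0. A compact TF1-style sketch of that pattern, with a dummy variable standing in for the policy graph:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Dummy variable standing in for the policy's trainable parameters.
weights = tf.Variable(tf.random.normal([4, 2]), name="policy_weights")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Start all workers from identical weights (rank 0's initial values).
    sess.run(hvd.broadcast_global_variables(0))

    if hvd.rank() == 0:
        # Only the chief writes checkpoints, matching the guard above.
        tf.train.Saver().save(sess, "./model-0.ckpt")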

ml-agents/mlagents/trainers/stats.py (7 changes)


from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import set_gauge
from mlagents.tf_utils import tf, generate_session_config
import horovod.tensorflow as hvd
logger = get_logger(__name__)

) -> None:
    is_training = "Not Training."
    if "Is Training" in values:
        stats_summary = stats_summary = values["Is Training"]
        stats_summary = values["Is Training"]
    rank = hvd.rank()
    "{}: Step: {}. "
    "Horovod Rank: {}, {}: Step: {}. "
    rank,
    category,
    step,
    time.time() - self.training_start_time,

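The ConsoleWriter change prefixes each console summary with the worker's Horovod rank so interleaved output from several processes stays attributable. Roughly this shape; the format string below is paraphrased, not copied from the branch:

import time
import horovod.tensorflow as hvd


def write_console_summary(category: str, step: int, mean_reward: float,
                          training_start_time: float, is_training: bool) -> None:
    status = "Training." if is_training else "Not Training."
    # Rank prefix distinguishes workers when their logs are interleaved.
    print(
        "Horovod Rank: {}, {}: Step: {}. Time Elapsed: {:0.3f} s. "
        "Mean Reward: {:0.3f}. {}".format(
            hvd.rank(), category, step,
            time.time() - training_start_time, mean_reward, status,
        )
    )


if __name__ == "__main__":
    hvd.init()
    write_console_summary("3DBall", 12000, 1.2, time.time() - 60.0, True)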
ml-agents/mlagents/trainers/ppo/trainer.py (11 changes)


Processing involves calculating value and advantage targets for model updating step.
:param trajectory: The Trajectory tuple containing the steps to be processed.
"""
super()._process_trajectory(trajectory)
# super()._process_trajectory(trajectory)
agent_id = trajectory.agent_id # All the agents should have the same ID
agent_buffer_trajectory = trajectory.to_agentbuffer()

Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.
"""
buffer_length = self.update_buffer.num_experiences
self._maybe_write_summary(
    self.get_step + self.trainer_parameters["buffer_size"]
)
self._increment_step(self.trainer_parameters["buffer_size"], self.brain_name)
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.

self.update_buffer["advantages"].set(
    (advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
max_num_batch = buffer_length // batch_size
max_num_batch = self.trainer_parameters["buffer_size"] // batch_size
for l in range(0, max_num_batch * batch_size, batch_size):
    update_stats = self.optimizer.update(
        buffer.make_mini_batch(l, l + batch_size), n_sequences

update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
    self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:

ml-agents/mlagents/trainers/sac/trainer.py (14 changes)


"""
self.cumulative_returns_since_policy_update.clear()
self._maybe_write_summary(
    self.get_step
    + self.trainer_parameters["num_update"]
    * self.trainer_parameters["batch_size"]
)
self._increment_step(
    self.trainer_parameters["num_update"]
    * self.trainer_parameters["batch_size"],
    self.brain_name,
)
num_updates = self.trainer_parameters["num_update"]
for _ in range(num_updates):
for _ in range(self.trainer_parameters["num_update"]):
    logger.debug("Updating SAC policy at step {}".format(self.step))
    buffer = self.update_buffer
    if (

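Taken together with the PPO hunk above, step accounting now happens inside _update_policy: PPO advances the global step by buffer_size per update, SAC by num_update * batch_size. A tiny numeric illustration, using the 3DBall buffer_size from the config hunk and assumed SAC values:

ppo_params = {"buffer_size": 12000}                # from the 3DBall config above
sac_params = {"num_update": 1, "batch_size": 128}  # assumed, not from the diff

# Steps added to the trainer's step counter per _update_policy call.
ppo_steps_per_update = ppo_params["buffer_size"]
sac_steps_per_update = sac_params["num_update"] * sac_params["batch_size"]

print(ppo_steps_per_update)  # 12000
print(sac_steps_per_update)  # 128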
ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changes)


"""
return False
@abc.abstractmethod
def _update_policy(self):
    """
    Uses demonstration_buffer to update model.

Takes a trajectory and processes it, putting it into the update buffer.
:param trajectory: The Trajectory tuple containing the steps to be processed.
"""
self._maybe_write_summary(self.get_step + len(trajectory.steps))
self._increment_step(len(trajectory.steps), trajectory.behavior_id)
pass
def _maybe_write_summary(self, step_after_process: int) -> None:
"""

"""
if step_after_process >= self.next_summary_step and self.get_step != 0:
if step_after_process >= self.next_summary_step:
    self._write_summary(self.next_summary_step)
def advance(self) -> None:

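In the base class, the per-trajectory _maybe_write_summary/_increment_step calls are removed (that bookkeeping now lives in the subclasses' _update_policy, as above) and the summary condition drops its get_step != 0 clause, which is what the "fix first summary statement output" commits address: the first summary boundary is no longer skipped. An illustrative reduction of that logic, with made-up names:

class SummaryGate:
    # Stand-in for the base trainer's summary bookkeeping after the change.
    def __init__(self, summary_freq: int) -> None:
        self.summary_freq = summary_freq
        self.next_summary_step = summary_freq

    def maybe_write_summary(self, step_after_process: int) -> bool:
        # The removed "and step != 0" clause used to suppress the very first
        # summary; now any step that reaches the boundary triggers one.
        if step_after_process >= self.next_summary_step:
            self.next_summary_step += self.summary_freq
            return True
        return False


gate = SummaryGate(summary_freq=12000)
print(gate.maybe_write_summary(12000))  # True: the first boundary is reported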