
Add comments and alphabetize flags

Tag: 0.2.0
Arthur Juliani, 7 years ago
Current commit: 75ea16ff
5 files changed, 52 insertions(+), 16 deletions(-)
    python/ppo.py                     | 30
    python/ppo/models.py              |  2
    python/ppo/trainer.py             | 15
    python/unityagents/curriculum.py  | 10
    python/curricula/wall.json        | 11

python/ppo.py (30 changes)


  Options:
    --help                     Show this message.
-   --curriculum=<file>        Curriculum json file for environment [default: None]
+   --batch-size=<n>           How many experiences per gradient descent update step [default: 64].
+   --beta=<n>                 Strength of entropy regularization [default: 5e-3].
+   --buffer-size=<n>          How large the experience buffer should be before gradient descent [default: 2048].
+   --curriculum=<file>        Curriculum json file for environment [default: None].
+   --epsilon=<n>              Acceptable threshold around ratio of old and new policy probabilities [default: 0.2].
+   --gamma=<n>                Reward discount rate [default: 0.995].
+   --hidden-units=<n>         Number of units in hidden layer [default: 64].
+   --keep-checkpoints=<n>     How many model checkpoints to keep [default: 5].
+   --lambd=<n>                Lambda parameter for GAE [default: 0.95].
+   --learning-rate=<rate>     Model learning rate [default: 3e-4].
+   --load                     Whether to load the model or randomly initialize [default: False].
+   --num-epoch=<n>            Number of gradient descent steps per batch of experiences [default: 5].
-   --load                     Whether to load the model or randomly initialize [default: False].
-   --train                    Whether to train model, or only run inference [default: False].
-   --summary-freq=<n>         Frequency at which to save training statistics [default: 10000].
-   --gamma=<n>                Reward discount rate [default: 0.995].
-   --lambd=<n>                Lambda parameter for GAE [default: 0.95].
+   --summary-freq=<n>         Frequency at which to save training statistics [default: 10000].
-   --beta=<n>                 Strength of entropy regularization [default: 1e-3].
-   --num-epoch=<n>            Number of gradient descent steps per batch of experiences [default: 5].
-   --epsilon=<n>              Acceptable threshold around ratio of old and new policy probabilities [default: 0.2].
-   --buffer-size=<n>          How large the experience buffer should be before gradient descent [default: 2048].
-   --learning-rate=<rate>     Model learning rate [default: 3e-4].
-   --hidden-units=<n>         Number of units in hidden layer [default: 64].
-   --batch-size=<n>           How many experiences per gradient descent update step [default: 64].
-   --keep-checkpoints=<n>     How many model checkpoints to keep [default: 5].
-   --worker-id=<n>            Number to add to communication port (5005). Used for asynchronous agent scenarios [default: 0].
+   --train                    Whether to train model, or only run inference [default: False].
+   --worker-id=<n>            Number to add to communication port (5005). Used for multi-environment [default: 0].
  '''
  options = docopt(_USAGE)
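As context for the alphabetized flag list above: docopt parses the module docstring (_USAGE) into a plain dictionary keyed by flag name, with every valued option returned as a string. A minimal sketch of that pattern, using a trimmed, illustrative usage string rather than the full docstring from ppo.py:

from docopt import docopt

# Trimmed, illustrative usage string; the real ppo.py docstring lists every flag shown above.
_USAGE = '''
Usage:
  ppo.py [options]

Options:
  --curriculum=<file>   Curriculum json file for environment [default: None].
  --batch-size=<n>      How many experiences per gradient descent update step [default: 64].
  --train               Whether to train model, or only run inference [default: False].
'''

options = docopt(_USAGE)

# docopt returns strings for valued options (and bools for bare flags),
# so numeric values and the "None" sentinel still need explicit conversion.
curriculum_file = None if options['--curriculum'] == 'None' else options['--curriculum']
batch_size = int(options['--batch-size'])
train_model = options['--train']  # already a bool, since --train takes no value
print(curriculum_file, batch_size, train_model)

Run with no command-line arguments, this sketch prints None 64 False, i.e. the [default: ...] values from the options block.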

python/ppo/models.py (2 changes)


  class PPOModel(object):
      def create_global_steps(self):
+         """Creates TF ops to track and increment global training step."""
  ...
+         """Creates TF ops to track and increment recent average cumulative reward."""
          self.last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
          self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
          self.update_reward = tf.assign(self.last_reward, self.new_reward)
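The last_reward variable, new_reward placeholder, and update_reward assign op above give the trainer a way to push the most recent mean cumulative reward into the graph (e.g. for checkpointing or summaries). A minimal TF 1.x sketch of driving that pattern from a session; the session setup is illustrative, not code from models.py:

import tensorflow as tf  # TensorFlow 1.x API, as used by this version of the codebase

# The same variable / placeholder / assign trio as in the hunk above.
last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
update_reward = tf.assign(last_reward, new_reward)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # The trainer would feed the latest mean cumulative reward here.
    sess.run(update_reward, feed_dict={new_reward: 1.25})
    print(sess.run(last_reward))  # 1.25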

python/ppo/trainer.py (15 changes)


  class Trainer(object):
      def __init__(self, ppo_model, sess, info, is_continuous, use_observations, use_states, training):
          """
-         Responsible for collecting experinces and training PPO model.
+         Responsible for collecting experiences and training PPO model.
          :param ppo_model: Tensorflow graph defining model.
          :param sess: Tensorflow session.
          :param info: Environment BrainInfo object.

          self.use_states = use_states

      def running_average(self, data, steps, running_mean, running_variance):
+         """
+         Computes new running mean and variances.
+         :param data: New piece of data.
+         :param steps: Total number of data so far.
+         :param running_mean: TF op corresponding to stored running mean.
+         :param running_variance: TF op corresponding to stored running variance.
+         :return: New mean and variance values.
+         """
          mean, var = self.sess.run([running_mean, running_variance])
          current_x = np.mean(data, axis=0)
          new_mean = mean + (current_x - mean) / (steps + 1)
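The last line of this hunk is the standard incremental-mean recurrence, new_mean = mean + (x - mean) / (n + 1). The sketch below shows the same idea as a self-contained function, with a Welford-style running sum of squared deviations for the variance; how trainer.py itself stores and normalizes the variance is not visible in this hunk, so that part is an assumption:

import numpy as np

def running_average(data, steps, mean, var_sum):
    """Incremental mean (same update as above) plus a Welford-style variance accumulator."""
    current_x = np.mean(data, axis=0)                   # summarize the batch as one sample
    new_mean = mean + (current_x - mean) / (steps + 1)  # identical to the line in the diff
    new_var_sum = var_sum + (current_x - new_mean) * (current_x - mean)
    return new_mean, new_var_sum

# Toy usage: stream five batches of 3-dimensional states.
mean, var_sum, steps = np.zeros(3), np.zeros(3), 0
for _ in range(5):
    batch = np.random.randn(32, 3)
    mean, var_sum = running_average(batch, steps, mean, var_sum)
    steps += 1
print(mean, var_sum / steps)  # running mean and a rough variance estimate per feature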

          history['episode_steps'] = 0

      def reset_buffers(self, brain_info=None, total=False):
+         """
+         Resets either all training buffers or local training buffers
+         :param brain_info: The BrainInfo object containing agent ids.
+         :param total: Whether to completely clear buffer.
+         """
          self.training_buffer = vectorize_history(empty_local_history({}))
          if not total:
              for key in self.history_dict:
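reset_buffers distinguishes a local reset (keep the per-agent keys but empty their contents) from a total reset (rebuild the buffer dictionary outright). Since the helpers it relies on (empty_local_history, vectorize_history) are not shown in this hunk, here is a rough stand-alone sketch of the same pattern using plain dicts; names and structure are illustrative:

def empty_local_history(_old=None):
    # Stand-in for the real helper: a fresh, empty per-agent buffer.
    return {'states': [], 'actions': [], 'rewards': []}

def reset_buffers(history_dict, agent_ids=None, total=False):
    """Empty each agent's buffer, or rebuild the whole dict for the current agents when total=True."""
    if not total:
        return {key: empty_local_history(history_dict[key]) for key in history_dict}
    return {agent_id: empty_local_history() for agent_id in (agent_ids or [])}

buffers = {0: {'states': [1, 2], 'actions': [0], 'rewards': [0.5]}}
buffers = reset_buffers(buffers)                                # local reset: same agents, empty lists
buffers = reset_buffers(buffers, agent_ids=[0, 1], total=True)  # total reset: rebuilt for current agents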

python/unityagents/curriculum.py (10 changes)


  class Curriculum(object):
      def __init__(self, location, default_reset_parameters):
+         """
+         Initializes a Curriculum object.
+         :param location: Path to JSON defining curriculum.
+         :param default_reset_parameters: Set of reset parameters for environment.
+         """
          self.lesson_number = 0
          self.lesson_length = 0
          self.measure_type = None

          self.lesson_number = max(0, min(value, self.max_lesson_number))

      def get_lesson(self, progress):
+         """
+         Returns reset parameters which correspond to current lesson.
+         :param progress: Measure of progress (either reward or percentage steps completed).
+         :return: Dictionary containing reset parameters.
+         """
          if self.data is None or progress is None:
              return {}
          if self.data["signal_smoothing"]:
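Taken together with the constructor above, get_lesson optionally smooths the incoming progress signal, advances to the next lesson once the current threshold is cleared and the lesson has lasted long enough, and returns the reset parameters for the active lesson. The sketch below re-implements that flow in isolation; the smoothing weights and the exact advancement condition are assumptions, not code copied from curriculum.py:

import json

class SimpleCurriculum:
    """Illustrative re-implementation of the lesson-selection logic suggested by the diff above."""

    def __init__(self, location):
        with open(location) as f:
            self.data = json.load(f)
        self.lesson_number = 0
        self.lesson_length = 0
        self.smoothed_progress = 0.0
        self.max_lesson_number = len(self.data['thresholds'])

    def get_lesson(self, progress):
        if progress is None:
            return {}
        if self.data['signal_smoothing']:
            # Exponential moving average of the progress signal; the 0.75/0.25 weights are illustrative.
            self.smoothed_progress = 0.75 * self.smoothed_progress + 0.25 * progress
            progress = self.smoothed_progress
        self.lesson_length += 1
        if (self.lesson_number < self.max_lesson_number
                and progress > self.data['thresholds'][self.lesson_number]
                and self.lesson_length > self.data['min_lesson_length']):
            self.lesson_number += 1
            self.lesson_length = 0
        # Each reset parameter is a per-lesson list; return the current lesson's value for each one.
        return {name: values[self.lesson_number]
                for name, values in self.data['parameters'].items()}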

python/curricula/wall.json (11 changes)


+ {
+     "measure" : "reward",
+     "thresholds" : [0.7, 0.7, 0.7, 0.6, 0.6, 0.6, 0.5, 0.5, 0.5],
+     "min_lesson_length" : 3,
+     "signal_smoothing" : true,
+     "parameters" :
+     {
+         "min_wall_height" : [0, 0, 1, 1, 2, 2, 3, 3, 4, 4],
+         "max_wall_height" : [0, 1, 1, 2, 2, 3, 3, 4, 4, 5]
+     }
+ }
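Read as data, this curriculum defines ten lessons (one per position in the wall-height lists) separated by nine reward thresholds, with signal smoothing enabled. A quick way to see the reset parameters each lesson would supply, using only the json module; the path is the repository-relative one shown above:

import json

with open('python/curricula/wall.json') as f:
    data = json.load(f)

# One row per lesson: the wall-height range used when the environment resets.
heights = zip(data['parameters']['min_wall_height'], data['parameters']['max_wall_height'])
for lesson, (low, high) in enumerate(heights):
    threshold = data['thresholds'][lesson] if lesson < len(data['thresholds']) else None
    print(f"lesson {lesson}: min_wall_height={low}, max_wall_height={high}, "
          f"advance when reward > {threshold}")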