
Add comments and alphabetize flags

Tag: 0.2.0
Arthur Juliani, 7 years ago
Current commit: 75ea16ff
5 files changed, 52 insertions(+), 16 deletions(-)
    python/ppo.py                     | 30
    python/ppo/models.py              |  2
    python/ppo/trainer.py             | 15
    python/unityagents/curriculum.py  | 10
    python/curricula/wall.json        | 11

python/ppo.py (30 changes)


  Options:
    --help                     Show this message.
-   --curriculum=<file>        Curriculum json file for environment [default: None]
+   --batch-size=<n>           How many experiences per gradient descent update step [default: 64].
+   --beta=<n>                 Strength of entropy regularization [default: 5e-3].
+   --buffer-size=<n>          How large the experience buffer should be before gradient descent [default: 2048].
+   --curriculum=<file>        Curriculum json file for environment [default: None].
+   --epsilon=<n>              Acceptable threshold around ratio of old and new policy probabilities [default: 0.2].
+   --gamma=<n>                Reward discount rate [default: 0.995].
+   --hidden-units=<n>         Number of units in hidden layer [default: 64].
+   --keep-checkpoints=<n>     How many model checkpoints to keep [default: 5].
+   --lambd=<n>                Lambda parameter for GAE [default: 0.95].
+   --learning-rate=<rate>     Model learning rate [default: 3e-4].
+   --load                     Whether to load the model or randomly initialize [default: False].
+   --num-epoch=<n>            Number of gradient descent steps per batch of experiences [default: 5].
-   --load                     Whether to load the model or randomly initialize [default: False].
-   --train                    Whether to train model, or only run inference [default: False].
-   --summary-freq=<n>         Frequency at which to save training statistics [default: 10000].
-   --gamma=<n>                Reward discount rate [default: 0.995].
-   --lambd=<n>                Lambda parameter for GAE [default: 0.95].
+   --summary-freq=<n>         Frequency at which to save training statistics [default: 10000].
-   --beta=<n>                 Strength of entropy regularization [default: 1e-3].
-   --num-epoch=<n>            Number of gradient descent steps per batch of experiences [default: 5].
-   --epsilon=<n>              Acceptable threshold around ratio of old and new policy probabilities [default: 0.2].
-   --buffer-size=<n>          How large the experience buffer should be before gradient descent [default: 2048].
-   --learning-rate=<rate>     Model learning rate [default: 3e-4].
-   --hidden-units=<n>         Number of units in hidden layer [default: 64].
-   --batch-size=<n>           How many experiences per gradient descent update step [default: 64].
-   --keep-checkpoints=<n>     How many model checkpoints to keep [default: 5].
-   --worker-id=<n>            Number to add to communication port (5005). Used for asynchronous agent scenarios [default: 0].
+   --train                    Whether to train model, or only run inference [default: False].
+   --worker-id=<n>            Number to add to communication port (5005). Used for multi-environment [default: 0].
  '''
  options = docopt(_USAGE)
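As context for the alphabetized flag list above: docopt parses the module docstring (_USAGE) into a plain dictionary keyed by flag name, with every valued option returned as a string. A minimal sketch of that pattern, using a trimmed, illustrative usage string rather than the full docstring from ppo.py:

from docopt import docopt

# Trimmed, illustrative usage string; the real ppo.py docstring lists every flag shown above.
_USAGE = '''
Usage:
  ppo.py [options]

Options:
  --curriculum=<file>   Curriculum json file for environment [default: None].
  --batch-size=<n>      How many experiences per gradient descent update step [default: 64].
  --train               Whether to train model, or only run inference [default: False].
'''

options = docopt(_USAGE)

# docopt returns strings for valued options (and bools for bare flags),
# so numeric values and the "None" sentinel still need explicit conversion.
curriculum_file = None if options['--curriculum'] == 'None' else options['--curriculum']
batch_size = int(options['--batch-size'])
train_model = options['--train']  # already a bool, since --train takes no value
print(curriculum_file, batch_size, train_model)

Run with no command-line arguments, this sketch prints None 64 False, i.e. the [default: ...] values from the options block.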

python/ppo/models.py (2 changes)


  class PPOModel(object):
      def create_global_steps(self):
+         """Creates TF ops to track and increment global training step."""
  ...
+         """Creates TF ops to track and increment recent average cumulative reward."""
          self.last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
          self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
          self.update_reward = tf.assign(self.last_reward, self.new_reward)
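The last_reward variable, new_reward placeholder, and update_reward assign op above give the trainer a way to push the most recent mean cumulative reward into the graph (e.g. for checkpointing or summaries). A minimal TF 1.x sketch of driving that pattern from a session; the session setup is illustrative, not code from models.py:

import tensorflow as tf  # TensorFlow 1.x API, as used by this version of the codebase

# The same variable / placeholder / assign trio as in the hunk above.
last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
update_reward = tf.assign(last_reward, new_reward)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # The trainer would feed the latest mean cumulative reward here.
    sess.run(update_reward, feed_dict={new_reward: 1.25})
    print(sess.run(last_reward))  # 1.25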

python/ppo/trainer.py (15 changes)


  class Trainer(object):
      def __init__(self, ppo_model, sess, info, is_continuous, use_observations, use_states, training):
          """
-         Responsible for collecting experinces and training PPO model.
+         Responsible for collecting experiences and training PPO model.
          :param ppo_model: Tensorflow graph defining model.
          :param sess: Tensorflow session.
          :param info: Environment BrainInfo object.

          self.use_states = use_states

      def running_average(self, data, steps, running_mean, running_variance):
+         """
+         Computes new running mean and variances.
+         :param data: New piece of data.
+         :param steps: Total number of data so far.
+         :param running_mean: TF op corresponding to stored running mean.
+         :param running_variance: TF op corresponding to stored running variance.
+         :return: New mean and variance values.
+         """
          mean, var = self.sess.run([running_mean, running_variance])
          current_x = np.mean(data, axis=0)
          new_mean = mean + (current_x - mean) / (steps + 1)
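The last line of this hunk is the standard incremental-mean recurrence, new_mean = mean + (x - mean) / (n + 1). The sketch below shows the same idea as a self-contained function, with a Welford-style running sum of squared deviations for the variance; how trainer.py itself stores and normalizes the variance is not visible in this hunk, so that part is an assumption:

import numpy as np

def running_average(data, steps, mean, var_sum):
    """Incremental mean (same update as above) plus a Welford-style variance accumulator."""
    current_x = np.mean(data, axis=0)                   # summarize the batch as one sample
    new_mean = mean + (current_x - mean) / (steps + 1)  # identical to the line in the diff
    new_var_sum = var_sum + (current_x - new_mean) * (current_x - mean)
    return new_mean, new_var_sum

# Toy usage: stream five batches of 3-dimensional states.
mean, var_sum, steps = np.zeros(3), np.zeros(3), 0
for _ in range(5):
    batch = np.random.randn(32, 3)
    mean, var_sum = running_average(batch, steps, mean, var_sum)
    steps += 1
print(mean, var_sum / steps)  # running mean and a rough variance estimate per feature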

          history['episode_steps'] = 0

      def reset_buffers(self, brain_info=None, total=False):
+         """
+         Resets either all training buffers or local training buffers
+         :param brain_info: The BrainInfo object containing agent ids.
+         :param total: Whether to completely clear buffer.
+         """
          self.training_buffer = vectorize_history(empty_local_history({}))
          if not total:
              for key in self.history_dict:
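reset_buffers distinguishes a local reset (keep the per-agent keys but empty their contents) from a total reset (rebuild the buffer dictionary outright). Since the helpers it relies on (empty_local_history, vectorize_history) are not shown in this hunk, here is a rough stand-alone sketch of the same pattern using plain dicts; names and structure are illustrative:

def empty_local_history(_old=None):
    # Stand-in for the real helper: a fresh, empty per-agent buffer.
    return {'states': [], 'actions': [], 'rewards': []}

def reset_buffers(history_dict, agent_ids=None, total=False):
    """Empty each agent's buffer, or rebuild the whole dict for the current agents when total=True."""
    if not total:
        return {key: empty_local_history(history_dict[key]) for key in history_dict}
    return {agent_id: empty_local_history() for agent_id in (agent_ids or [])}

buffers = {0: {'states': [1, 2], 'actions': [0], 'rewards': [0.5]}}
buffers = reset_buffers(buffers)                                # local reset: same agents, empty lists
buffers = reset_buffers(buffers, agent_ids=[0, 1], total=True)  # total reset: rebuilt for current agents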

python/unityagents/curriculum.py (10 changes)


  class Curriculum(object):
      def __init__(self, location, default_reset_parameters):
+         """
+         Initializes a Curriculum object.
+         :param location: Path to JSON defining curriculum.
+         :param default_reset_parameters: Set of reset parameters for environment.
+         """
          self.lesson_number = 0
          self.lesson_length = 0
          self.measure_type = None

          self.lesson_number = max(0, min(value, self.max_lesson_number))

      def get_lesson(self, progress):
+         """
+         Returns reset parameters which correspond to current lesson.
+         :param progress: Measure of progress (either reward or percentage steps completed).
+         :return: Dictionary containing reset parameters.
+         """
          if self.data is None or progress is None:
              return {}
          if self.data["signal_smoothing"]:
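Taken together with the constructor above, get_lesson optionally smooths the incoming progress signal, advances to the next lesson once the current threshold is cleared and the lesson has lasted long enough, and returns the reset parameters for the active lesson. The sketch below re-implements that flow in isolation; the smoothing weights and the exact advancement condition are assumptions, not code copied from curriculum.py:

import json

class SimpleCurriculum:
    """Illustrative re-implementation of the lesson-selection logic suggested by the diff above."""

    def __init__(self, location):
        with open(location) as f:
            self.data = json.load(f)
        self.lesson_number = 0
        self.lesson_length = 0
        self.smoothed_progress = 0.0
        self.max_lesson_number = len(self.data['thresholds'])

    def get_lesson(self, progress):
        if progress is None:
            return {}
        if self.data['signal_smoothing']:
            # Exponential moving average of the progress signal; the 0.75/0.25 weights are illustrative.
            self.smoothed_progress = 0.75 * self.smoothed_progress + 0.25 * progress
            progress = self.smoothed_progress
        self.lesson_length += 1
        if (self.lesson_number < self.max_lesson_number
                and progress > self.data['thresholds'][self.lesson_number]
                and self.lesson_length > self.data['min_lesson_length']):
            self.lesson_number += 1
            self.lesson_length = 0
        # Each reset parameter is a per-lesson list; return the current lesson's value for each one.
        return {name: values[self.lesson_number]
                for name, values in self.data['parameters'].items()}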

python/curricula/wall.json (11 changes)


+ {
+     "measure" : "reward",
+     "thresholds" : [0.7, 0.7, 0.7, 0.6, 0.6, 0.6, 0.5, 0.5, 0.5],
+     "min_lesson_length" : 3,
+     "signal_smoothing" : true,
+     "parameters" :
+     {
+         "min_wall_height" : [0, 0, 1, 1, 2, 2, 3, 3, 4, 4],
+         "max_wall_height" : [0, 1, 1, 2, 2, 3, 3, 4, 4, 5]
+     }
+ }
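Read as data, this curriculum defines ten lessons (one per position in the wall-height lists) separated by nine reward thresholds, with signal smoothing enabled. A quick way to see the reset parameters each lesson would supply, using only the json module; the path is the repository-relative one shown above:

import json

with open('python/curricula/wall.json') as f:
    data = json.load(f)

# One row per lesson: the wall-height range used when the environment resets.
heights = zip(data['parameters']['min_wall_height'], data['parameters']['max_wall_height'])
for lesson, (low, high) in enumerate(heights):
    threshold = data['thresholds'][lesson] if lesson < len(data['thresholds']) else None
    print(f"lesson {lesson}: min_wall_height={low}, max_wall_height={high}, "
          f"advance when reward > {threshold}")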