
Moved curriculum code out of environment code.

/develop-generalizationTraining-TrainerController
Deric Pang, 7 years ago
Current commit
8380f2f2
9 files changed: 44 insertions and 32 deletions
  1. docs/Training-Curriculum-Learning.md (2 changes)
  2. python/unityagents/__init__.py (1 change)
  3. python/unityagents/environment.py (29 changes)
  4. python/unitytrainers/__init__.py (1 change)
  5. python/unitytrainers/ppo/trainer.py (3 changes)
  6. python/unitytrainers/trainer_controller.py (26 changes)
  7. python/unitytrainers/curriculum.py (4 changes)
  8. python/curricula/push_curriculum.json (10 changes)
  9. python/unitytrainers/curriculum.py (renamed from python/unityagents/curriculum.py, 0 changes)

docs/Training-Curriculum-Learning.md (2 changes)


structure of the curriculum. Within it we can set at what points in the training process
our wall height will change, either based on the percentage of training steps that have
taken place, or on the average reward the agent has received in the recent past.
- Once these are in place, we simply launch ppo.py using the `--curriculum-file` flag to
+ Once these are in place, we simply launch learn.py using the `--curriculum-file` flag to
point to the JSON file, and PPO will train using Curriculum Learning. Of course, we can
then keep track of the current lesson and progress via TensorBoard.
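For reference, here is a minimal sketch of how such a curriculum file maps a lesson number to reset parameters. This is not the shipped `Curriculum` class, just an illustration; the file path assumes the repository layout shown in this commit.

```python
import json

# Load the curriculum definition added in this commit.
with open("python/curricula/push_curriculum.json") as f:
    data = json.load(f)

def config_for_lesson(lesson):
    # Each entry in "parameters" holds one value per lesson; index by lesson number.
    return {name: values[lesson] for name, values in data["parameters"].items()}

print(config_for_lesson(0))  # {'goal_size': 25.0}
print(config_for_lesson(1))  # {'goal_size': 5.0}
```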

python/unityagents/__init__.py (1 change)


from .environment import *
from .brain import *
from .exception import *
- from .curriculum import *

python/unityagents/environment.py (29 changes)


from .brain import BrainInfo, BrainParameters, AllBrainInfo
from .exception import UnityEnvironmentException, UnityActionException, UnityTimeOutException
- from .curriculum import Curriculum
from communicator_objects import UnityRLInput, UnityRLOutput, AgentActionProto,\
    EnvironmentParametersProto, UnityRLInitializationInput, UnityRLInitializationOutput,\

class UnityEnvironment(object):
    def __init__(self, file_name=None, worker_id=0,
-                base_port=5005, curriculum=None,
-                seed=0, docker_training=False, no_graphics=False):
+                base_port=5005, seed=0,
+                docker_training=False, no_graphics=False):
        """
        Starts a new unity environment and establishes a connection with the environment.
        Notice: Currently communication between Unity and Python takes place over an open socket without authentication.

        self._num_brains = len(self._brain_names)
        self._num_external_brains = len(self._external_brain_names)
        self._resetParameters = dict(aca_params.environment_parameters.float_parameters)  # TODO
-       self._curriculum = Curriculum(curriculum, self._resetParameters)

-   @property
-   def curriculum(self):
-       return self._curriculum

    @property
    def logfile_path(self):

        # return SocketCommunicator(worker_id, base_port)

-       _new_reset_param = self._curriculum.get_config()
-           self._resetParameters[k] = _new_reset_param[k]
+       # Set reset parameters from trainer.
+       '''_new_reset_param = self._curriculum.get_config()
+           self._resetParameters[k] = _new_reset_param[k]'''

    def __str__(self):
-       Lesson number : {3}
-       Reset Parameters :\n\t\t{4}'''.format(self._academy_name, str(self._num_brains),
-                                             str(self._num_external_brains), self._curriculum.get_lesson_number,
-                                             "\n\t\t".join([str(k) + " -> " + str(self._resetParameters[k])
+       Reset Parameters :\n\t\t{3}'''.format(self._academy_name, str(self._num_brains),
+                                             str(self._num_external_brains),
+                                             "\n\t\t".join([str(k) + " -> " + str(self._resetParameters[k])

-   def reset(self, train_mode=True, config=None, lesson=None) -> AllBrainInfo:
-       if config is None:
-           config = self._curriculum.get_config(lesson)
-       elif config != {}:
+   def reset(self, config, train_mode=True, lesson=None) -> AllBrainInfo:
+       if config != {}:
            logger.info("\nAcademy Reset with parameters : \t{0}"
                        .format(', '.join([str(x) + ' -> ' + str(config[x]) for x in config])))
            for k in config:
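The practical effect of this hunk is that `reset` no longer consults an internal curriculum; the caller must pass reset parameters explicitly. A hypothetical usage sketch under the new signature (the environment name and parameter values are illustrative):

```python
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="push_block")  # no curriculum argument anymore
info = env.reset(config={"goal_size": 25.0},    # caller supplies the reset parameters
                 train_mode=True)
```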

python/unitytrainers/__init__.py (1 change)


from .buffer import *
+ from .curriculum import *
from .models import *
from .trainer_controller import *
from .bc.models import *
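Assuming `curriculum.py` defines a top-level `Curriculum` class (as the diffs below suggest), downstream imports follow the file move:

```python
# Illustrative import change for downstream code after the move.
# old: from unityagents import Curriculum
from unitytrainers import Curriculum
```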

python/unitytrainers/ppo/trainer.py (3 changes)


        n_sequences = max(int(self.trainer_parameters['batch_size'] / self.sequence_length), 1)
        value_total, policy_total, forward_total, inverse_total = [], [], [], []
        advantages = self.training_buffer.update_buffer['advantages'].get_batch()
+       print('advantages:', advantages)
+       print('advantages mean:', advantages.mean())
+       print('advantages std:', advantages.std())
        self.training_buffer.update_buffer['advantages'].set(
            (advantages - advantages.mean()) / (advantages.std() + 1e-10))
        num_epoch = self.trainer_parameters['num_epoch']
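The three `print` calls are debug output around the standard advantage normalization that follows them: advantages are shifted to zero mean and scaled to unit variance, with a small epsilon guarding against a zero standard deviation. A standalone sketch of the same computation, with illustrative values:

```python
import numpy as np

advantages = np.array([0.5, -1.2, 2.0, 0.1])  # illustrative advantage estimates
normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-10)
print(normalized.mean(), normalized.std())    # ~0.0 and ~1.0
```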

python/unitytrainers/trainer_controller.py (26 changes)


from unitytrainers.bc.trainer import BehavioralCloningTrainer
from unityagents import UnityEnvironment, UnityEnvironmentException
+ from .curriculum import Curriculum
from unityagents.exception import UnityEnvironmentException, UnityActionException, UnityTimeOutException
from communicator_objects import UnityRLInitializationInput

class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train,

        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
-                                   curriculum=self.curriculum_file, seed=self.seed,
-                                   docker_training=self.docker_training,
+                                   seed=self.seed, docker_training=self.docker_training,
+       self.curriculum = Curriculum(curriculum_file, self.env._resetParameters)
+       self.env.reset(config=self.curriculum.get_config(), train_mode=self.fast_simulation)
-       if self.env.curriculum.measure_type == "progress":
+       if self.curriculum.measure_type == "progress":
-       elif self.env.curriculum.measure_type == "reward":
+       elif self.curriculum.measure_type == "reward":
            for brain_name in self.env.external_brain_names:
                progress += self.trainers[brain_name].get_last_reward
            return progress

                               .format(model_path))

    def start_learning(self):
-       self.env.curriculum.set_lesson_number(self.lesson)
+       self.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
-           self.env.curriculum.increment_lesson(self._get_progress())
-           curr_info = self.env.reset(train_mode=self.fast_simulation)
+           self.curriculum.increment_lesson(self._get_progress())
+           curr_info = self.env.reset(config=self.curriculum.get_config(), train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)

-               self.env.curriculum.increment_lesson(self._get_progress())
-               curr_info = self.env.reset(train_mode=self.fast_simulation)
+               self.curriculum.increment_lesson(self._get_progress())
+               curr_info = self.env.reset(config=self.curriculum.get_config(), train_mode=self.fast_simulation)
                for brain_name, trainer in self.trainers.items():
                    trainer.end_episode()
                # Decide and take an action

                    # Perform gradient descent with experience buffer
                    trainer.update_model()
                    # Write training statistics to Tensorboard.
-                   trainer.write_summary(self.env.curriculum.lesson_number)
+                   trainer.write_summary(self.curriculum.lesson_number)
                if self.train_model and trainer.get_step <= trainer.get_max_steps:
                    trainer.increment_step_and_update_last_reward()
            if self.train_model:
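Taken together, these hunks invert the ownership: the controller constructs the `Curriculum`, advances it based on training progress, and hands the resulting config to every `env.reset`. A simplified sketch of that loop; `get_progress`, `run_episode`, and `max_episodes` are hypothetical stand-ins, not names from the diff:

```python
def training_loop(env, curriculum, fast_simulation, get_progress, run_episode, max_episodes):
    for _ in range(max_episodes):
        # The controller, not the environment, decides when to advance the lesson...
        curriculum.increment_lesson(get_progress())
        # ...and passes the lesson's reset parameters to the environment explicitly.
        info = env.reset(config=curriculum.get_config(), train_mode=fast_simulation)
        run_episode(env, info)
```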

python/unitytrainers/curriculum.py (4 changes)


import json
- from .exception import UnityEnvironmentException
+ from unityagents.exception import UnityEnvironmentException
import logging

    def increment_lesson(self, progress):
        """
-       Increments the lesson number depending on the progree given.
+       Increments the lesson number depending on the progress given.
        :param progress: Measure of progress (either reward or percentage steps completed).
        """
        if self.data is None or progress is None:
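For intuition, a minimal sketch of the threshold check that `increment_lesson` performs, ignoring `min_lesson_length` and signal smoothing, which the real class also handles:

```python
def next_lesson(thresholds, lesson_number, progress):
    # Advance one lesson once progress crosses the current lesson's threshold.
    if lesson_number < len(thresholds) and progress > thresholds[lesson_number]:
        lesson_number += 1
    return lesson_number

print(next_lesson([0.1], 0, 0.05))  # 0: below the threshold
print(next_lesson([0.1], 0, 0.2))   # 1: threshold crossed
```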

python/curricula/push_curriculum.json (10 changes)


+ {
+     "measure" : "progress",
+     "thresholds" : [0.1],
+     "min_lesson_length" : 2,
+     "signal_smoothing" : true,
+     "parameters" :
+     {
+         "goal_size" : [25.0, 5.0]
+     }
+ }
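Read as a schedule: one threshold defines two lessons, so with `measure` set to `progress`, `goal_size` stays at 25.0 until roughly 10% of training steps have elapsed, then drops to 5.0 (whether the comparison is strict is an implementation detail). A quick illustration:

```python
def goal_size_at(progress):
    # Lesson 0 until ~10% of max steps, lesson 1 afterwards.
    return 25.0 if progress < 0.1 else 5.0

assert goal_size_at(0.05) == 25.0
assert goal_size_at(0.5) == 5.0
```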

python/unityagents/curriculum.py → python/unitytrainers/curriculum.py (renamed)
