        Processing involves calculating value and advantage targets for the model update step.
        :param trajectory: The Trajectory tuple containing the steps to be processed.
        """
        super()._process_trajectory(trajectory)
        agent_id = trajectory.agent_id  # All the agents should have the same ID

        agent_buffer_trajectory = trajectory.to_agentbuffer()
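
# Illustrative sketch (not part of the original trainer): the "value and advantage
# targets" mentioned in the docstring above are commonly computed with Generalized
# Advantage Estimation (GAE). The helper below is a self-contained reference
# implementation; the function name and the exact estimator this trainer uses are
# assumptions, not taken from the source.
from typing import List


def compute_gae_advantages(
    rewards: List[float],
    value_estimates: List[float],
    bootstrap_value: float,
    gamma: float = 0.99,
    lambd: float = 0.95,
) -> List[float]:
    """Compute GAE advantages for a single trajectory.

    rewards[t] and value_estimates[t] correspond to step t; bootstrap_value is the
    value estimate of the state after the last step (0.0 if the episode terminated).
    """
    values = list(value_estimates) + [bootstrap_value]
    advantages = [0.0] * len(rewards)
    gae = 0.0
    # Walk backwards so each step accumulates the discounted TD residuals of all later
    # steps: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), A_t = delta_t + gamma * lambda * A_{t+1}.
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lambd * gae
        advantages[t] = gae
    return advantages


# The corresponding value targets (returns) are advantages[t] + value_estimates[t],
# which is what the value-function update consumes.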

    def _update_policy(self):
        """
        Uses demonstration_buffer and the update buffer to update the policy.
        The reward signal generators must be updated in this method at their own pace.
        """
        buffer_length = self.update_buffer.num_experiences
        self.cumulative_returns_since_policy_update.clear()

        # Make sure batch_size is a multiple of sequence length. During training, we
        # will need to reshape the data into batches of whole sequences.
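        # Illustrative sketch (assumed, not from the original source): the constraint
        # above is usually enforced by rounding the configured batch size down to a
        # multiple of the policy's sequence length while keeping at least one full
        # sequence. The "batch_size" key and self.policy.sequence_length are assumed names.
        batch_size = self.trainer_parameters["batch_size"]
        batch_size -= batch_size % self.policy.sequence_length
        batch_size = max(batch_size, self.policy.sequence_length)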

        # Normalize the advantage estimates over the whole update buffer. (The buffer
        # read/write calls wrapping the original expression are assumed here.)
        advantages = self.update_buffer["advantages"].get_batch()
        self.update_buffer["advantages"].set(
            (advantages - advantages.mean()) / (advantages.std() + 1e-10)
        )
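        # Worked example (illustrative): for advantages [1.0, 2.0, 3.0] the mean is 2.0
        # and the population std is ~0.8165, so the normalized values are ~[-1.22, 0.0, 1.22]
        # (zero mean, unit standard deviation). The 1e-10 term only matters when the std
        # is (near) zero, where it prevents a division by zero.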

        # Increment steps when training instead of when generating from the environment.
        self._increment_step(self.trainer_parameters["buffer_size"], self.brain_name)

        num_epoch = self.trainer_parameters["num_epoch"]
        batch_update_stats = defaultdict(list)
        for _ in range(num_epoch):
            update_stats = self.optimizer.bc_module.update()
            for stat, val in update_stats.items():
                self._stats_reporter.add_stat(stat, val)
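
        # Illustrative sketch (assumed, not part of the original source): in comparable
        # trainers, batch_update_stats collects per-minibatch optimizer statistics during
        # the epoch loop and is averaged into the stats reporter once the loop finishes,
        # for example:
        for stat_name, values in batch_update_stats.items():
            self._stats_reporter.add_stat(stat_name, sum(values) / len(values))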

        super()._update_policy()
        self._clear_update_buffer()

    def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy: