sac update

4 年前 · 0a1a30d3
--- a/config/sac_transfer/3DBall.yaml
+++ b/config/sac_transfer/3DBall.yaml
    trainer_type: sac_transfer
    hyperparameters:
      learning_rate: 0.0003
-      learning_rate_schedule: constant
-      model_schedule: constant
+      learning_rate_schedule: linear
+      model_schedule: linear
      batch_size: 64
      buffer_size: 12000
      buffer_init_steps: 0
      forward_layers: 1
      value_layers: 2
      feature_size: 16
-      separate_value_train: true
-      reuse_encoder: true
+      reuse_encoder: false
      in_epoch_alter: false
      in_batch_alter: true
      use_op_buffer: false
-      use_bisim: false
+      use_bisim: true
-      normalize: false
+      normalize: true
      hidden_units: 64
      num_layers: 2
      vis_encode_type: simple
--- a/config/sac_transfer/3DBallHard.yaml
+++ b/config/sac_transfer/3DBallHard.yaml
    trainer_type: sac_transfer
    hyperparameters:
      learning_rate: 0.0003
-      learning_rate_schedule: constant
+      learning_rate_schedule: linear
      batch_size: 256
      buffer_size: 50000
      buffer_init_steps: 0
      use_var_predict: true
      with_prior: false
      predict_return: true
-      use_bisim: false
+      use_bisim: true
    network_settings:
      normalize: true
      hidden_units: 128
--- a/config/sac_transfer/3DBallHardTransfer.yaml
+++ b/config/sac_transfer/3DBallHardTransfer.yaml
    trainer_type: sac_transfer
    hyperparameters:
      learning_rate: 0.0003
-      learning_rate_schedule: constant
+      learning_rate_schedule: linear
      batch_size: 256
      buffer_size: 50000
      buffer_init_steps: 0
      use_var_predict: true
      with_prior: false
      predict_return: true
-      use_bisim: false
+      use_bisim: true
-      load_policy: false
-      load_value: false
-      transfer_path: "results/"
+      transfer_path: "results/sac_model_ball_sep_bisim/3DBall"
    network_settings:
      normalize: true
      hidden_units: 128
--- a/ml-agents/mlagents/trainers/policy/transfer_policy.py
+++ b/ml-agents/mlagents/trainers/policy/transfer_policy.py
        forward_layers: int,
        var_predict: bool = False,
        reuse: bool = False,
-        separate_train: bool = False,
    ) -> None:
        """
        Creates forward model TensorFlow ops for Curiosity module.
        """
        combined_input = tf.concat([encoded_state, encoded_action], axis=1)
        hidden = combined_input
+
+        if not self.transfer:
+            hidden = tf.stop_gradient(hidden)

        for i in range(forward_layers):
            hidden = tf.layers.dense(
        encoded_state: tf.Tensor,
        encoded_action: tf.Tensor,
        forward_layers: int,
-        separate_train: bool = False,
-        # if self.transfer:
-        #    hidden = tf.stop_gradient(hidden)
+        if not self.transfer:
+           hidden = tf.stop_gradient(hidden)
        for i in range(forward_layers):
            hidden = tf.layers.dense(
                hidden,
        self.bisim_action = tf.placeholder(
            shape=[None, sum(self.act_size)], dtype=tf.float32, name="bisim_action"
        )
-        self.bisim_action_encoder = self._create_action_encoder(
-            self.bisim_action,
-            self.h_size,
-            self.action_feature_size,
-            action_layers,
-            reuse=True,
-        )
-        combined_input = tf.concat([self.bisim_encoder, self.bisim_action_encoder], axis=1)
+        # self.bisim_action_encoder = self._create_action_encoder(
+        #     self.bisim_action,
+        #     self.h_size,
+        #     self.action_feature_size,
+        #     action_layers,
+        #     reuse=True,
+        # )
+        combined_input = tf.concat([self.bisim_encoder, self.bisim_action], axis=1)
        combined_input = tf.stop_gradient(combined_input)

        with tf.variable_scope("predict"):
--- a/ml-agents/mlagents/trainers/sac_transfer/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac_transfer/optimizer.py
            self.stats_name_to_update_name.update({
                "Losses/Reward Loss": "reward_loss",
            })
-        if self.use_bisim:
-            self.stats_name_to_update_name.update({
-                "Losses/Bisim Loss": "bisim_loss",
-                "Policy/Bisim Learning Rate": "bisim_learning_rate",
-            })

        self.update_dict = {
            "value_loss": self.total_value_loss,
        policy_vars = self.policy.get_trainable_variables(
            train_encoder=self.train_encoder,
            train_action=self.train_action,
-            train_model=False,
+            train_model=self.train_model,
            train_policy=self.train_policy
        )

            train_model=self.train_model,
-            train_policy=False
+            train_policy=self.train_policy
        )

        if self.train_value:
        # Make sure policy is updated first, then value, then entropy.
        with tf.control_dependencies([self.update_batch_policy]):
            self.update_batch_value = value_optimizer.minimize(
-                self.total_value_loss, var_list=critic_vars
+                self.total_value_loss, var_list=self.policy_network.critic_vars
            )
            # Add entropy coefficient optimization operation
            with tf.control_dependencies([self.update_batch_value]):
            logger.debug(_var)

    @timed
-    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+    def update(self, batch: AgentBuffer, batch_bisim: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        """
        Updates model using buffer.
        :param num_sequences: Number of trajectories in batch.
        feed_dict = self._construct_feed_dict(self.policy, batch, num_sequences)
        stats_needed = self.stats_name_to_update_name
        update_stats: Dict[str, float] = {}
-        update_vals = self._execute_model(feed_dict, self.update_dict)
-        update_vals.update(self._execute_model(feed_dict, self.model_update_dict))
-
-        if self.use_bisim:
-            batch1 = copy.deepcopy(batch)
-            batch.shuffle(sequence_length=1)
-            batch2 = copy.deepcopy(batch)
-            bisim_stats = self.update_encoder(batch1, batch2)
+        update_vals = self._execute_model(feed_dict, self.model_update_dict)
+        update_vals.update(self._execute_model(feed_dict, self.update_dict))
+
+        if self.use_bisim:
+            bisim_stats = self.update_encoder(batch, batch_bisim)
+            update_stats.update(bisim_stats)
+        
        # Update target network. By default, target update happens at every policy update.
        self.sess.run(self.target_update_op)
        self.policy.run_soft_copy()
--- a/ml-agents/mlagents/trainers/sac_transfer/trainer.py
+++ b/ml-agents/mlagents/trainers/sac_transfer/trainer.py
                    self.hyperparameters.batch_size,
                    sequence_length=self.policy.sequence_length,
                )
+                sampled_minibatch_bisim = buffer.sample_mini_batch(
+                    self.hyperparameters.batch_size,
+                    sequence_length=self.policy.sequence_length,
+                )
                # Get rewards for each reward
                for name, signal in self.optimizer.reward_signals.items():
                    sampled_minibatch[
-                update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
+                update_stats = self.optimizer.update(
+                    sampled_minibatch, 
+                    sampled_minibatch_bisim, 
+                    n_sequences)
                for stat_name, value in update_stats.items():
                    batch_update_stats[stat_name].append(value)

--- a/config/sac_transfer/CrawlerStatic.yaml
+++ b/config/sac_transfer/CrawlerStatic.yaml
+behaviors:
+  CrawlerStatic:  # behavior entry; every setting below is scoped to this name
+    trainer_type: sac_transfer  # same trainer_type as the 3DBall* sac_transfer configs
+    hyperparameters:
+      learning_rate: 0.0003
+      learning_rate_schedule: constant  # NOTE(review): the 3DBall* configs in this commit switch to linear; confirm constant is intended
+      batch_size: 256
+      buffer_size: 500000
+      buffer_init_steps: 2000
+      tau: 0.005
+      steps_per_update: 20.0
+      save_replay_buffer: false
+      init_entcoef: 1.0
+      reward_signal_steps_per_update: 20.0  # kept equal to steps_per_update above
+      encoder_layers: 2  # the *_layers / feature_size keys are sac_transfer-specific; presumably per-sub-network sizes, verify against the trainer settings class
+      policy_layers: 1
+      forward_layers: 2
+      value_layers: 3
+      feature_size: 128
+      reuse_encoder: false
+      in_epoch_alter: false
+      in_batch_alter: true
+      use_op_buffer: false
+      use_var_predict: true
+      with_prior: false
+      predict_return: true  # NOTE(review): no use_bisim key here although the 3DBall* configs set it to true; presumably the trainer default applies, confirm
+    network_settings:
+      normalize: true
+      hidden_units: 512
+      num_layers: 3
+      vis_encode_type: simple
+    reward_signals:
+      extrinsic:  # only reward signal configured in this file
+        gamma: 0.995
+        strength: 1.0
+    keep_checkpoints: 5
+    max_steps: 3000000
+    time_horizon: 1000
+    summary_freq: 30000
+    threaded: true
--- a/config/sac_transfer/CrawlerStaticTransfer.yaml
+++ b/config/sac_transfer/CrawlerStaticTransfer.yaml
+behaviors:
+  CrawlerStatic:  # same behavior name as in CrawlerStatic.yaml
+    trainer_type: sac_transfer
+    hyperparameters:
+      learning_rate: 0.0003
+      learning_rate_schedule: constant  # NOTE(review): the 3DBall* configs in this commit switch to linear; confirm constant is intended
+      batch_size: 256
+      buffer_size: 500000
+      buffer_init_steps: 2000
+      tau: 0.005
+      steps_per_update: 20.0
+      save_replay_buffer: false
+      init_entcoef: 1.0
+      reward_signal_steps_per_update: 20.0  # kept equal to steps_per_update above
+      encoder_layers: 2  # the *_layers / feature_size keys are sac_transfer-specific; presumably per-sub-network sizes, verify against the trainer settings class
+      policy_layers: 1
+      forward_layers: 2
+      value_layers: 3
+      feature_size: 128
+      reuse_encoder: false
+      in_epoch_alter: false
+      in_batch_alter: true
+      use_op_buffer: false
+      use_var_predict: true
+      with_prior: false
+      predict_return: true
+      use_transfer: true  # this key and the three below are the only additions relative to CrawlerStatic.yaml
+      load_model: true
+      train_model: false  # presumably keeps the loaded model fixed during training; TODO confirm in the optimizer
+      transfer_path: "results/cs-sacmod-old/3DBall"  # NOTE(review): ends in 3DBall although the behavior is CrawlerStatic; possible copy-paste from the 3DBall transfer config, confirm the directory exists
+    network_settings:
+      normalize: true
+      hidden_units: 512
+      num_layers: 3
+      vis_encode_type: simple
+    reward_signals:
+      extrinsic:  # only reward signal configured in this file
+        gamma: 0.995
+        strength: 1.0
+    keep_checkpoints: 5
+    max_steps: 3000000
+    time_horizon: 1000
+    summary_freq: 30000
+    threaded: true