浏览代码

[change] Remove concatenate in discrete action probabilities to improve inference performance (#3598)

/bug-failed-api-check
GitHub 5 年前
当前提交
94de596b
共有 5 个文件被更改,包括 50 次插入和 38 次删除
  1. 1
      com.unity.ml-agents/CHANGELOG.md
  2. 5
      ml-agents/mlagents/trainers/distributions.py
  3. 36
      ml-agents/mlagents/trainers/models.py
  4. 8
      ml-agents/mlagents/trainers/ppo/optimizer.py
  5. 38
      ml-agents/mlagents/trainers/sac/optimizer.py

1
com.unity.ml-agents/CHANGELOG.md


- `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity. (#3555)
- The `IFloatProperties` interface has been removed.
- Fix #3579.
- Improved inference performance for models with multiple action branches. (#3598)
- Fixed an issue when using GAIL with less than `batch_size` number of demonstrations. (#3591)
- The interfaces to the `SideChannel` classes (on C# and python) have changed to use new `IncomingMessage` and `OutgoingMessage` classes. These should make reading and writing data to the channel easier. (#3596)

5
ml-agents/mlagents/trainers/distributions.py


kernel_initializer=ModelUtils.scaled_init(0.01),
)
)
unmasked_log_probs = tf.concat(policy_branches, axis=1)
return unmasked_log_probs
return policy_branches
unmasked_log_probs: tf.Tensor,
unmasked_log_probs: List[tf.Tensor],
act_size: List[int],
action_masks: tf.Tensor,
) -> Tuple[tf.Tensor, tf.Tensor, np.ndarray]:

36
ml-agents/mlagents/trainers/models.py


)
@staticmethod
def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
def break_into_branches(
    concatenated_logits: tf.Tensor, action_size: List[int]
) -> List[tf.Tensor]:
    """
    Split a tensor of concatenated discrete-action logits into per-branch tensors.

    :param concatenated_logits: Tensor whose second axis is the concatenation of
        all action branches.
    :param action_size: Number of possible actions for each branch, in order.
    :return: A list with one tensor per branch; branch i has width action_size[i].
    """
    # Running totals of the branch widths give the slice boundaries along axis 1.
    boundaries = [0] + list(np.cumsum(action_size))
    return [
        concatenated_logits[:, begin:end]
        for begin, end in zip(boundaries, boundaries[1:])
    ]
@staticmethod
def create_discrete_action_masking_layer(
branches_logits: List[tf.Tensor],
action_masks: tf.Tensor,
action_size: List[int],
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
:param all_logits: The concatenated unnormalized action probabilities for all branches
:param branches_logits: A List of the unnormalized action probabilities for each branch
:param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action]
:param action_size: A list containing the number of possible actions for each branch
:return: The action output dimension [batch_size, num_branches], the concatenated

action_idx = [0] + list(np.cumsum(action_size))
branches_logits = [
all_logits[:, action_idx[i] : action_idx[i + 1]]
for i in range(len(action_size))
]
branch_masks = [
action_masks[:, action_idx[i] : action_idx[i + 1]]
for i in range(len(action_size))
]
branch_masks = ModelUtils.break_into_branches(action_masks, action_size)
raw_probs = [
tf.multiply(tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
for k in range(len(action_size))

8
ml-agents/mlagents/trainers/ppo/optimizer.py


dtype=tf.float32,
name="old_probabilities",
)
# Break old log probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)
self.all_old_log_probs, self.policy.action_masks, self.policy.act_size
old_log_prob_branches, self.policy.action_masks, self.policy.act_size
)
action_idx = [0] + list(np.cumsum(self.policy.act_size))

38
ml-agents/mlagents/trainers/sac/optimizer.py


for name in stream_names:
if discrete:
_branched_mpq1 = self._apply_as_branches(
self.policy_network.q1_pheads[name] * discrete_action_probs
_branched_mpq1 = ModelUtils.break_into_branches(
self.policy_network.q1_pheads[name] * discrete_action_probs,
self.act_size,
)
branched_mpq1 = tf.stack(
[

)
_q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
_branched_mpq2 = self._apply_as_branches(
self.policy_network.q2_pheads[name] * discrete_action_probs
_branched_mpq2 = ModelUtils.break_into_branches(
self.policy_network.q2_pheads[name] * discrete_action_probs,
self.act_size,
)
branched_mpq2 = tf.stack(
[

if discrete:
# We need to break up the Q functions by branch, and update them individually.
branched_q1_stream = self._apply_as_branches(
self.policy.selected_actions * q1_streams[name]
branched_q1_stream = ModelUtils.break_into_branches(
self.policy.selected_actions * q1_streams[name], self.act_size
branched_q2_stream = self._apply_as_branches(
self.policy.selected_actions * q2_streams[name]
branched_q2_stream = ModelUtils.break_into_branches(
self.policy.selected_actions * q2_streams[name], self.act_size
)
# Reduce each branch into scalar

self.ent_coef = tf.exp(self.log_ent_coef)
if discrete:
# We also have to do a different entropy and target_entropy per branch.
branched_per_action_ent = self._apply_as_branches(per_action_entropy)
branched_per_action_ent = ModelUtils.break_into_branches(
per_action_entropy, self.act_size
)
branched_ent_sums = tf.stack(
[
tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te

# Same with policy loss, we have to do the loss per branch and average them,
# so that larger branches don't get more weight.
# The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
branched_q_term = self._apply_as_branches(
discrete_action_probs * self.policy_network.q1_p
branched_q_term = ModelUtils.break_into_branches(
discrete_action_probs * self.policy_network.q1_p, self.act_size
)
branched_policy_loss = tf.stack(

self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
self.entropy = self.policy_network.entropy
def _apply_as_branches(self, concat_logits: tf.Tensor) -> List[tf.Tensor]:
    """
    Split concatenated per-action logits into one tensor per action branch.

    :param concat_logits: Tensor whose second axis concatenates all of this
        policy's action branches (widths given by self.act_size).
    :return: A list containing one tensor per branch.
    """
    # Cumulative branch sizes delimit each branch's slice along axis 1.
    offsets = [0] + list(np.cumsum(self.act_size))
    return [
        concat_logits[:, lo:hi] for lo, hi in zip(offsets, offsets[1:])
    ]
def _create_sac_optimizer_ops(self) -> None:
"""

正在加载...
取消
保存