|
|
|
|
|
|
return hidden_flat |
|
|
|
|
|
|
|
@staticmethod
def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
    """
    Creates a masking layer for the discrete actions.

    :param all_logits: The concatenated unnormalized action probabilities (logits)
        for all branches, shape [batch_size, sum(action_size)].
    :param action_masks: The mask for the logits — presumably 1.0 for allowed
        actions and 0.0 for forbidden ones, shape [batch_size, sum(action_size)].
        TODO(review): confirm mask convention against the caller that feeds the
        `action_masks` placeholder.
    :param action_size: A list with the number of possible actions for each branch.
    :return: A tuple of (sampled action indices of shape [batch_size, num_branches],
        the concatenated normalized log-probabilities for all branches).
    """
    # Branch boundaries within the concatenated logit/mask tensors,
    # e.g. action_size=[3, 2] -> action_idx=[0, 3, 5].
    action_idx = [0] + list(np.cumsum(action_size))
    # Slice the flat logits and masks back into per-branch tensors.
    branches_logits = [all_logits[:, action_idx[i]:action_idx[i + 1]] for i in range(len(action_size))]
    # NOTE(review): this line is not visible in the residue being resolved;
    # reconstructed symmetrically to `branches_logits` — confirm against upstream.
    branch_masks = [action_masks[:, action_idx[i]:action_idx[i + 1]] for i in range(len(action_size))]
    # Zero out masked actions, then give them a tiny floor (1e-10) so that
    # tf.log below never sees an exact zero for a fully-masked entry.
    raw_probs = [tf.multiply(tf.nn.softmax(branches_logits[k]), branch_masks[k]) + (1 - branch_masks[k]) * 1.0e-10
                 for k in range(len(action_size))]
    # Re-normalize each branch so the surviving probabilities sum to 1.
    # NOTE(review): reconstructed from the residue's surrounding lines — confirm.
    normalized_probs = [tf.divide(raw_probs[k], tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
                        for k in range(len(action_size))]
    # Sample one action per branch from the masked, renormalized distribution.
    # NOTE(review): reconstructed — confirm sampling op against upstream.
    output = tf.concat([tf.multinomial(tf.log(normalized_probs[k]), 1)
                        for k in range(len(action_size))], axis=1)
    return output, tf.concat([tf.log(normalized_probs[k]) for k in range(len(action_size))], axis=1)
|
|
|
|
|
|
|
def create_observation_streams(self, num_streams, h_size, num_layers): |
|
|
|
""" |
|
|
|
|
|
|
self.all_log_probs = tf.concat([branch for branch in policy_branches], axis=1, name="action_probs") |
|
|
|
|
|
|
|
self.action_masks = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name="action_masks") |
|
|
|
output = self.create_discrete_action_masking_layer(policy_branches, self.action_masks, self.a_size) |
|
|
|
output, normalized_logits = self.create_discrete_action_masking_layer( |
|
|
|
self.all_log_probs, self.action_masks, self.a_size) |
|
|
|
|
|
|
|
self.output = tf.identity(output, name="action") |
|
|
|
|
|
|
|
|
|
|
self.action_holder = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder") |
|
|
|
self.action_holder = tf.placeholder( |
|
|
|
shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder") |
|
|
|
self.all_old_log_probs = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name='old_probabilities') |
|
|
|
self.all_old_log_probs = tf.placeholder( |
|
|
|
shape=[None, sum(self.a_size)], dtype=tf.float32, name='old_probabilities') |
|
|
|
_, old_normalized_logits = self.create_discrete_action_masking_layer( |
|
|
|
self.all_old_log_probs, self.action_masks, self.a_size) |
|
|
|
|
|
|
|
action_idx = [0] + list(np.cumsum(self.a_size)) |
|
|
|
|
|
|
|
|
|
|
self.log_probs = tf.reduce_sum((tf.stack([ |
|
|
|
-tf.nn.softmax_cross_entropy_with_logits_v2( |
|
|
|
labels=self.selected_actions[:, action_idx[i]:action_idx[i + 1]], |
|
|
|
logits=self.all_log_probs[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
logits=normalized_logits[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
logits=self.all_old_log_probs[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
logits=old_normalized_logits[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
) |
|
|
|
for i in range(len(self.a_size))], axis=1)), axis=1, keepdims=True) |
|
|
|
|