
Return list instead of np array for make_mini_batch() (#2371)

Return list instead of np array for make_mini_batch() to reduce the time spent copying data
/develop-generalizationTraining-TrainerController
GitHub · 5 years ago
Current commit
d7ebaae1
11 changed files with 151 additions and 238 deletions
  1. ml-agents/mlagents/trainers/buffer.py (110 changed lines)
  2. ml-agents/mlagents/trainers/components/bc/module.py (38 changed lines)
  3. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (48 changed lines)
  4. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (6 changed lines)
  5. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (56 changed lines)
  6. ml-agents/mlagents/trainers/ppo/models.py (7 changed lines)
  7. ml-agents/mlagents/trainers/ppo/policy.py (63 changed lines)
  8. ml-agents/mlagents/trainers/ppo/trainer.py (15 changed lines)
  9. ml-agents/mlagents/trainers/tests/mock_brain.py (3 changed lines)
  10. ml-agents/mlagents/trainers/tests/test_buffer.py (28 changed lines)
  11. ml-agents/mlagents/trainers/tests/test_reward_signals.py (15 changed lines)

ml-agents/mlagents/trainers/buffer.py (110 changed lines)


sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives
[[a,b],[b,c],[c,d],[d,e]]
"""
if training_length == 1:
# When the training length is 1, the method returns a list of elements,
# not a list of sequences of elements.
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
# If batch_size is None : All the elements of the AgentBufferField are returned.
return np.array(self)
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (
len(self) // training_length + 1 * (leftover != 0)
):
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1]) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:]
)
# return the batch_size last elements
if batch_size > len(self):
raise BufferException("Batch size requested is too large")
return np.array(self[-batch_size:])
return np.array(
self[len(self) - batch_size * training_length :]
)
# The training_length is not None, the method returns a list of SEQUENCES of elements
if not sequential:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
tmp_list = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += [np.array(self[end - training_length : end])]
return np.array(tmp_list)
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (
leftover != 0
)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (
len(self) // training_length + 1 * (leftover != 0)
):
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
tmp_list = []
padding = np.array(self[-1]) * self.padding_value
# The padding is made with zeros and its shape is given by the shape of the last element
for end in range(
len(self), len(self) % training_length, -training_length
)[:batch_size]:
tmp_list += [np.array(self[end - training_length : end])]
if (leftover != 0) and (len(tmp_list) < batch_size):
tmp_list += [
np.array(
[padding] * (training_length - leftover)
+ self[:leftover]
)
]
tmp_list.reverse()
return np.array(tmp_list)
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
"The batch size and training length requested for get_batch were"
" too large given the current number of data points."
)
tmp_list = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list)
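As a sanity check on the docstring example above, here is a standalone sketch (not the AgentBuffer code itself, and assuming zero padding) that reproduces the two sequencing modes for [a, b, c, d, e] with training_length=2:

def split_into_sequences(data, training_length, sequential=True, padding_value=0):
    if sequential:
        # Non-overlapping sequences; the first one is left-padded if needed.
        leftover = len(data) % training_length
        pad = [padding_value] * ((training_length - leftover) % training_length)
        padded = pad + list(data)
        return [padded[i : i + training_length] for i in range(0, len(padded), training_length)]
    # Overlapping sequences: one per possible end position.
    return [data[end - training_length : end] for end in range(training_length, len(data) + 1)]

print(split_into_sequences(["a", "b", "c", "d", "e"], 2, sequential=True))
# [[0, 'a'], ['b', 'c'], ['d', 'e']]
print(split_into_sequences(["a", "b", "c", "d", "e"], 2, sequential=False))
# [['a', 'b'], ['b', 'c'], ['c', 'd'], ['d', 'e']]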
def reset_field(self):
"""

length = len(self[key])
return True
def shuffle(self, key_list=None):
def shuffle(self, sequence_length, key_list=None):
Shuffles the fields in key_list in a consistent way: The reordering will
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.
:param key_list: The fields that must be shuffled.

raise BufferException(
"Unable to shuffle if the fields are not of same length"
)
s = np.arange(len(self[key_list[0]]))
s = np.arange(len(self[key_list[0]]) // sequence_length)
self[key][:] = [self[key][i] for i in s]
tmp = []
for i in s:
tmp += self[key][i * sequence_length : (i + 1) * sequence_length]
self[key][:] = tmp
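The new shuffle keeps each block of sequence_length consecutive entries intact and reorders whole blocks the same way for every field, so recurrent sequences stay aligned across keys. A minimal sketch of that idea, using a plain dict of equal-length lists in place of the real buffer fields:

import numpy as np

def shuffle_fields(fields, sequence_length):
    num_sequences = len(next(iter(fields.values()))) // sequence_length
    order = np.random.permutation(num_sequences)
    for key, values in fields.items():
        shuffled = []
        for i in order:
            # Move whole blocks of `sequence_length` steps, in the same order for every field.
            shuffled += values[i * sequence_length : (i + 1) * sequence_length]
        fields[key][:] = shuffled

fields = {"actions": list(range(8)), "rewards": list(range(100, 108))}
shuffle_fields(fields, sequence_length=2)
print(fields["actions"], fields["rewards"])  # blocks of two, reordered consistently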
def make_mini_batch(self, start, end):
"""

"""
mini_batch = {}
for key in self:
mini_batch[key] = np.array(self[key][start:end])
mini_batch[key] = self[key][start:end]
return mini_batch
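This is the heart of the change: make_mini_batch returns plain list slices, and conversion to a NumPy array (or a reshape) now happens only where a TensorFlow placeholder actually needs it. A hedged sketch with a toy dict standing in for the buffer:

import numpy as np

buffer = {"actions": [[0.1, 0.2]] * 10, "rewards": [1.0] * 10}

def make_mini_batch(buffer, start, end):
    # Slicing a list is cheap; no per-key np.array copy is made here any more.
    return {key: values[start:end] for key, values in buffer.items()}

mini_batch = make_mini_batch(buffer, 0, 4)
print(type(mini_batch["actions"]))            # <class 'list'>
print(np.array(mini_batch["actions"]).shape)  # (4, 2), built only where an array is required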
def __init__(self):

ml-agents/mlagents/trainers/components/bc/module.py (38 changed lines)


n_epoch = self.num_epoch
for _ in range(n_epoch):
self.demonstration_buffer.update_buffer.shuffle()
self.demonstration_buffer.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
for i in range(num_batches):
for i in range(num_batches // self.policy.sequence_length):
start = i * self.n_sequences
end = (i + 1) * self.n_sequences
start = i * self.n_sequences * self.policy.sequence_length
end = (i + 1) * self.n_sequences * self.policy.sequence_length
mini_batch_demo = demo_update_buffer.make_mini_batch(start, end)
run_out = self._update_batch(mini_batch_demo, self.n_sequences)
loss = run_out["loss"]

self.policy.model.batch_size: n_sequences,
self.policy.model.sequence_length: self.policy.sequence_length,
}
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, self.policy.model.brain.vector_action_space_size[0]]
)
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, len(self.policy.model.brain.vector_action_space_size)]
)
feed_dict[self.policy.model.action_masks] = np.ones(
(
self.n_sequences,

if self.policy.model.brain.vector_observation_space_size > 0:
apparent_obs_size = (
self.policy.model.brain.vector_observation_space_size
* self.policy.model.brain.num_stacked_vector_observations
)
feed_dict[self.policy.model.vector_in] = mini_batch_demo[
"vector_obs"
].reshape([-1, apparent_obs_size])
feed_dict[self.policy.model.vector_in] = mini_batch_demo["vector_obs"]
visual_obs = mini_batch_demo["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = visual_obs.shape
feed_dict[self.policy.model.visual_in[i]] = visual_obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.policy.model.visual_in[i]] = visual_obs
feed_dict[self.policy.model.visual_in[i]] = mini_batch_demo[
"visual_obs%d" % i
]
if self.use_recurrent:
feed_dict[self.policy.model.memory_in] = np.zeros(
[self.n_sequences, self.policy.m_size]

"prev_action"
].reshape([-1, len(self.policy.model.act_size)])
]
network_out = self.policy.sess.run(
list(self.out_dict.values()), feed_dict=feed_dict
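Because the update buffer is now flat (one entry per environment step rather than one per sequence), the behavioral-cloning loop scales its slice indices by sequence_length. An indexing sketch with hypothetical sizes:

n_sequences = 2       # sequences per mini-batch
sequence_length = 3   # steps per sequence
buffer_length = 12    # total steps in the flat demonstration buffer

num_batches = buffer_length // (n_sequences * sequence_length)
for i in range(num_batches):
    start = i * n_sequences * sequence_length
    end = (i + 1) * n_sequences * sequence_length
    # make_mini_batch(start, end) now slices n_sequences * sequence_length
    # individual steps instead of n_sequences pre-built sequences.
    print(start, end)  # (0, 6) then (6, 12)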

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (48 changed lines)


forward_total: List[float] = []
inverse_total: List[float] = []
for _ in range(self.num_epoch):
update_buffer.shuffle()
update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = update_buffer
for l in range(len(update_buffer["actions"]) // num_sequences):
start = l * num_sequences

feed_dict = {
self.policy.model.batch_size: num_sequences,
self.policy.model.sequence_length: self.policy.sequence_length,
self.policy.model.mask_input: mini_batch["masks"].flatten(),
self.policy.model.advantage: mini_batch["advantages"].reshape([-1, 1]),
self.policy.model.all_old_log_probs: mini_batch["action_probs"].reshape(
[-1, sum(self.policy.model.act_size)]
),
self.policy.model.mask_input: mini_batch["masks"],
self.policy.model.advantage: mini_batch["advantages"],
self.policy.model.all_old_log_probs: mini_batch["action_probs"],
feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"].reshape(
[-1, self.policy.model.act_size[0]]
)
feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"]
feed_dict[self.policy.model.action_holder] = mini_batch["actions"].reshape(
[-1, len(self.policy.model.act_size)]
)
feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"].reshape(
[-1, self.policy.vec_obs_size]
)
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"].reshape(
[-1, self.policy.vec_obs_size]
)
feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
_obs = mini_batch["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.policy.model.visual_in[i]] = _obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.policy.model.visual_in[i]] = _obs
feed_dict[self.policy.model.visual_in[i]] = mini_batch[
"visual_obs%d" % i
]
_obs = mini_batch["next_visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.model.next_visual_in[i]] = _obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.model.next_visual_in[i]] = _obs
feed_dict[self.model.next_visual_in[i]] = mini_batch[
"next_visual_obs%d" % i
]
self.has_updated = True
run_out = self.policy._execute_model(feed_dict, self.update_dict)
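The recurrent reshape of visual observations disappears because the flat buffer already holds one observation per step. A shape-only sketch with made-up dimensions:

import numpy as np

w, h, c = 84, 84, 3
batch, seq = 8, 4

old_style = np.zeros((batch, seq, w, h, c))   # per-sequence storage in the old buffer
flattened = old_style.reshape(-1, w, h, c)    # what the removed reshape produced
new_style = np.zeros((batch * seq, w, h, c))  # flat storage, fed to the placeholder as-is
assert flattened.shape == new_style.shape == (32, 84, 84, 3)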

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (6 changed lines)


"""
Creates the input layers for the discriminator
"""
self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)
self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
self.done_policy = tf.expand_dims(self.done_policy_holder, -1)
if self.policy_model.brain.vector_action_space_type == "continuous":
action_length = self.policy_model.act_size[0]
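The discriminator now accepts flat [None] done vectors and adds the trailing axis inside the graph, so callers no longer reshape to [None, 1]. A minimal TensorFlow 1.x sketch of the pattern (simplified names, not the full GAIL model):

import tensorflow as tf

done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
done_expert = tf.expand_dims(done_expert_holder, -1)  # shape becomes [None, 1] inside the graph

with tf.Session() as sess:
    print(sess.run(done_expert, feed_dict={done_expert_holder: [0.0, 1.0, 0.0]}))
    # [[0.] [1.] [0.]]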

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (56 changed lines)


n_epoch = self.num_epoch
for _epoch in range(n_epoch):
self.demonstration_buffer.update_buffer.shuffle()
update_buffer.shuffle()
self.demonstration_buffer.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
update_buffer.shuffle(sequence_length=self.policy.sequence_length)
if max_batches == 0:
num_batches = possible_batches
else:

:return: Output from update process.
"""
feed_dict: Dict[tf.Tensor, Any] = {
self.model.done_expert: mini_batch_demo["done"].reshape([-1, 1]),
self.model.done_policy: mini_batch_policy["done"].reshape([-1, 1]),
self.model.done_expert_holder: mini_batch_demo["done"],
self.model.done_policy_holder: mini_batch_policy["done"],
feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
feed_dict[self.policy.model.selected_actions] = mini_batch_policy[
"actions"
].reshape([-1, self.policy.model.act_size[0]])
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, self.policy.model.act_size[0]]
)
feed_dict[self.policy.model.selected_actions] = mini_batch_policy["actions"]
feed_dict[self.policy.model.action_holder] = mini_batch_policy[
"actions"
].reshape([-1, len(self.policy.model.act_size)])
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, len(self.policy.model.act_size)]
)
feed_dict[self.policy.model.action_holder] = mini_batch_policy["actions"]
policy_obs = mini_batch_policy["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = policy_obs.shape
feed_dict[self.policy.model.visual_in[i]] = policy_obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.policy.model.visual_in[i]] = policy_obs
demo_obs = mini_batch_demo["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = demo_obs.shape
feed_dict[self.model.expert_visual_in[i]] = demo_obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.model.expert_visual_in[i]] = demo_obs
feed_dict[self.policy.model.visual_in[i]] = mini_batch_policy[
"visual_obs%d" % i
]
feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
"visual_obs%d" % i
]
feed_dict[self.policy.model.vector_in] = mini_batch_policy[
"vector_obs"
].reshape([-1, self.policy.vec_obs_size])
feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"].reshape(
[-1, self.policy.vec_obs_size]
)
feed_dict[self.policy.model.vector_in] = mini_batch_policy["vector_obs"]
feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
out_dict = {
"gail_loss": self.model.loss,

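With the reshape calls gone, mini-batch values are fed to the placeholders as plain lists and TensorFlow converts them at feed time. A small TensorFlow 1.x sketch, assuming a 2-dimensional continuous action space:

import tensorflow as tf

action_holder = tf.placeholder(shape=[None, 2], dtype=tf.float32)
batch_shape = tf.shape(action_holder)

mini_batch_policy = {"actions": [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]}  # plain list, no copy up front

with tf.Session() as sess:
    print(sess.run(batch_shape, feed_dict={action_holder: mini_batch_policy["actions"]}))
    # [3 2] -- the nested list is converted when it is fed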
ml-agents/mlagents/trainers/ppo/models.py (7 changed lines)


self.returns_holders[name] = returns_holder
self.old_values[name] = old_value
self.advantage = tf.placeholder(
shape=[None, 1], dtype=tf.float32, name="advantages"
shape=[None], dtype=tf.float32, name="advantages"
advantage = tf.expand_dims(self.advantage, -1)
self.learning_rate = tf.train.polynomial_decay(
lr, self.global_step, max_step, 1e-10, power=1.0
)

self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * self.advantage
p_opt_a = r_theta * advantage
* self.advantage
* advantage
)
self.policy_loss = -tf.reduce_mean(
tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.mask, 2)[1]
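The advantage placeholder is now flat and is expanded inside the graph before entering the clipped surrogate objective. A TensorFlow 1.x sketch of that step (the clipping epsilon is assumed here, and the real model additionally masks padded steps with tf.dynamic_partition):

import tensorflow as tf

advantage_holder = tf.placeholder(shape=[None], dtype=tf.float32, name="advantages")
advantage = tf.expand_dims(advantage_holder, -1)               # [None] -> [None, 1]

probs = tf.placeholder(shape=[None, 1], dtype=tf.float32)      # new log-probabilities
old_probs = tf.placeholder(shape=[None, 1], dtype=tf.float32)  # log-probabilities at collection time
epsilon = 0.2                                                  # assumed clipping range

r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = tf.clip_by_value(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
policy_loss = -tf.reduce_mean(tf.minimum(p_opt_a, p_opt_b))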

ml-agents/mlagents/trainers/ppo/policy.py (63 changed lines)


def construct_feed_dict(self, model, mini_batch, num_sequences):
feed_dict = {
model.batch_size: num_sequences,
model.sequence_length: self.sequence_length,
model.mask_input: mini_batch["masks"].flatten(),
model.advantage: mini_batch["advantages"].reshape([-1, 1]),
model.all_old_log_probs: mini_batch["action_probs"].reshape(
[-1, sum(model.act_size)]
),
self.model.batch_size: num_sequences,
self.model.sequence_length: self.sequence_length,
self.model.mask_input: mini_batch["masks"],
self.model.advantage: mini_batch["advantages"],
self.model.all_old_log_probs: mini_batch["action_probs"],
].flatten()
]
].flatten()
]
feed_dict[model.output_pre] = mini_batch["actions_pre"].reshape(
[-1, model.act_size[0]]
)
feed_dict[model.epsilon] = mini_batch["random_normal_epsilon"].reshape(
[-1, model.act_size[0]]
)
feed_dict[model.output_pre] = mini_batch["actions_pre"]
feed_dict[model.epsilon] = mini_batch["random_normal_epsilon"]
feed_dict[model.action_holder] = mini_batch["actions"].reshape(
[-1, len(model.act_size)]
)
feed_dict[model.action_holder] = mini_batch["actions"]
feed_dict[model.prev_action] = mini_batch["prev_action"].reshape(
[-1, len(model.act_size)]
)
feed_dict[model.action_masks] = mini_batch["action_mask"].reshape(
[-1, sum(self.brain.vector_action_space_size)]
)
feed_dict[model.prev_action] = mini_batch["prev_action"]
feed_dict[model.action_masks] = mini_batch["action_mask"]
feed_dict[model.vector_in] = mini_batch["vector_obs"].reshape(
[-1, self.vec_obs_size]
)
if model.vis_obs_size > 0:
for i, _ in enumerate(model.visual_in):
_obs = mini_batch["visual_obs%d" % i]
if self.sequence_length > 1 and self.use_recurrent:
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[model.visual_in[i]] = _obs.reshape([-1, _w, _h, _c])
else:
feed_dict[model.visual_in[i]] = _obs
feed_dict[model.vector_in] = mini_batch["vector_obs"]
if self.model.vis_obs_size > 0:
for i, _ in enumerate(self.model.visual_in):
feed_dict[model.visual_in[i]] = mini_batch["visual_obs%d" % i]
mem_in = mini_batch["memory"][:, 0, :]
mem_in = [
mini_batch["memory"][i]
for i in range(0, len(mini_batch["memory"]), self.sequence_length)
]
feed_dict[model.memory_in] = mem_in
return feed_dict
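With flat storage the memory field holds one vector per step, so the feed takes every sequence_length-th entry to recover one initial memory state per sequence. A toy sketch:

sequence_length = 3
memories = [[step, step] for step in range(9)]  # 9 steps -> 3 sequences of length 3

mem_in = [memories[i] for i in range(0, len(memories), sequence_length)]
print(mem_in)  # [[0, 0], [3, 3], [6, 6]] -- the memory at the start of each sequence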

brain_info.memories = self.make_empty_memory(len(brain_info.agents))
feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
if not self.use_continuous_act and self.use_recurrent:
feed_dict[self.model.prev_action] = brain_info.previous_vector_actions[
idx
].reshape([-1, len(self.model.act_size)])
feed_dict[self.model.prev_action] = [
brain_info.previous_vector_actions[idx]
]
value_estimates = self.sess.run(self.model.value_heads, feed_dict)
value_estimates = {k: float(v) for k, v in value_estimates.items()}

ml-agents/mlagents/trainers/ppo/trainer.py (15 changed lines)


:return: A boolean corresponding to whether or not update_model() can be run
"""
size_of_buffer = len(self.training_buffer.update_buffer["actions"])
return size_of_buffer > max(
int(self.trainer_parameters["buffer_size"] / self.policy.sequence_length), 1
)
return size_of_buffer > self.trainer_parameters["buffer_size"]
def update_policy(self):
"""

mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),
)
self.cumulative_returns_since_policy_update = []
batch_size = self.trainer_parameters["batch_size"]
n_sequences = max(
int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
)

)
num_epoch = self.trainer_parameters["num_epoch"]
for _ in range(num_epoch):
self.training_buffer.update_buffer.shuffle()
self.training_buffer.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
len(self.training_buffer.update_buffer["actions"]) // n_sequences
0, len(self.training_buffer.update_buffer["actions"]), batch_size
start = l * n_sequences
end = (l + 1) * n_sequences
buffer.make_mini_batch(start, end), n_sequences
buffer.make_mini_batch(l, l + batch_size), n_sequences
)
value_total.append(run_out["value_loss"])
policy_total.append(np.abs(run_out["policy_loss"]))
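The PPO update loop now walks the flat buffer in strides of batch_size rather than counting sequences. A sketch with hypothetical trainer parameters:

batch_size = 1024
sequence_length = 16
n_sequences = max(batch_size // sequence_length, 1)  # still passed to the policy update
buffer_length = 4096  # len(update_buffer["actions"]) after flattening

for l in range(0, buffer_length, batch_size):
    start, end = l, l + batch_size
    # policy.update(buffer.make_mini_batch(start, end), n_sequences)
    print(start, end)  # (0, 1024), (1024, 2048), ...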

ml-agents/mlagents/trainers/tests/mock_brain.py (3 changed lines)


num_vis_observations=0,
num_vector_acts=2,
discrete=False,
num_discrete_branches=1,
):
"""
Creates a mock BrainInfo with observations. Imitates constant

)
if discrete:
mock_braininfo.return_value.previous_vector_actions = np.array(
num_agents * [1 * [0.5]]
num_agents * [num_discrete_branches * [0.5]]
)
mock_braininfo.return_value.action_masks = np.array(
num_agents * [num_vector_acts * [1.0]]

ml-agents/mlagents/trainers/tests/test_buffer.py (28 changed lines)


a = b[1]["vector_observation"].get_batch(
batch_size=2, training_length=1, sequential=True
)
assert_array(a, np.array([[171, 172, 173], [181, 182, 183]]))
assert_array(np.array(a), np.array([[171, 172, 173], [181, 182, 183]]))
a,
np.array(a),
[[231, 232, 233], [241, 242, 243], [251, 252, 253]],
[[261, 262, 263], [271, 272, 273], [281, 282, 283]],
[231, 232, 233],
[241, 242, 243],
[251, 252, 253],
[261, 262, 263],
[271, 272, 273],
[281, 282, 283],
]
),
)

assert_array(
a,
np.array(a),
[[251, 252, 253], [261, 262, 263], [271, 272, 273]],
[[261, 262, 263], [271, 272, 273], [281, 282, 283]],
[251, 252, 253],
[261, 262, 263],
[271, 272, 273],
[261, 262, 263],
[271, 272, 273],
[281, 282, 283],
]
),
)

b.append_update_buffer(2, batch_size=None, training_length=2)
assert len(b.update_buffer["action"]) == 10
assert np.array(b.update_buffer["action"]).shape == (10, 2, 2)
assert len(b.update_buffer["action"]) == 20
assert np.array(b.update_buffer["action"]).shape == (20, 2)
assert c["action"].shape == (1, 2, 2)
assert np.array(c["action"]).shape == (1, 2)
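The reworked assertions reflect the flattened storage: the same ten length-2 sequences of 2-dimensional actions now give shape (20, 2) instead of (10, 2, 2). A quick shape check:

import numpy as np

sequences = [[[i, i], [i + 0.5, i + 0.5]] for i in range(10)]  # 10 sequences of 2 steps each
flat = [step for seq in sequences for step in seq]             # flattened, one row per step

print(np.array(sequences).shape)  # (10, 2, 2) -- old layout
print(np.array(flat).shape)       # (20, 2)    -- new layout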

ml-agents/mlagents/trainers/tests/test_reward_signals.py (15 changed lines)


VECTOR_ACTION_SPACE = [2]
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [2]
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 20
NUM_AGENTS = 12

DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
),
discrete=use_discrete,
num_discrete_branches=len(DISCRETE_ACTION_SPACE),
)
else:
mock_brain = mb.create_mock_brainparams(

DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
),
discrete=use_discrete,
num_discrete_branches=len(DISCRETE_ACTION_SPACE),
)
mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
env = mock_env()

@mock.patch("mlagents.envs.UnityEnvironment")
def test_gail_dc(mock_env, dummy_config, gail_dummy_config):
env, policy = create_ppo_policy_mock(
mock_env, dummy_config, gail_dummy_config, False, True, False
)
reward_signal_eval(env, policy, "gail")
reward_signal_update(env, policy, "gail")
@mock.patch("mlagents.envs.UnityEnvironment")
def test_gail_visual(mock_env, dummy_config, gail_dummy_config):
def test_gail_dc_visual(mock_env, dummy_config, gail_dummy_config):
gail_dummy_config["gail"]["demo_path"] = (
os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"
)
