from typing import Any, Dict, List
import logging
import numpy as np
import tensorflow as tf

from mlagents.envs.brain import BrainInfo
from mlagents.trainers.buffer import Buffer
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
from .model import GAILModel
from mlagents.trainers.demo_loader import demo_to_buffer

LOGGER = logging.getLogger("mlagents.trainers")


class GAILRewardSignal(RewardSignal):
    def __init__(
        self,
        policy: TFPolicy,
        policy_model: LearningModel,
        strength: float,
        gamma: float,
        demo_path: str,
        num_epoch: int = 3,
        encoding_size: int = 64,
        learning_rate: float = 3e-4,
        samples_per_update: int = 0,
        use_actions: bool = False,
        use_vail: bool = False,
    ):
        """
        The GAIL Reward signal generator (https://arxiv.org/abs/1606.03476).
        :param policy: The policy of the learning agent.
        :param policy_model: The model of the learning policy, whose placeholders are used
            to feed policy samples into the discriminator.
        :param strength: The scaling factor applied to the raw GAIL reward.
        :param gamma: The time discounting factor used for this reward.
        :param demo_path: The path to the demonstration file.
        :param num_epoch: The number of epochs to train over the training buffer for the discriminator.
        :param encoding_size: The size of the hidden layers of the discriminator.
        :param learning_rate: The learning rate used during GAIL updates.
        :param samples_per_update: The maximum number of samples to update during GAIL updates.
        :param use_actions: Whether the discriminator receives actions as well as observations.
        :param use_vail: Whether to use a variational bottleneck (VAIL) in the discriminator.
        """
        super().__init__(policy, policy_model, strength, gamma)
        self.num_epoch = num_epoch
        self.samples_per_update = samples_per_update
        self.use_terminal_states = False

        self.model = GAILModel(
            policy_model, 128, learning_rate, encoding_size, use_actions, use_vail
        )
        _, self.demonstration_buffer = demo_to_buffer(
            demo_path, policy.sequence_length
        )
        self.has_updated = False
        self.update_dict: Dict[str, tf.Tensor] = {
            "gail_loss": self.model.loss,
            "gail_update_batch": self.model.update_batch,
            "gail_policy_estimate": self.model.policy_estimate,
            "gail_expert_estimate": self.model.expert_estimate,
        }
        if self.model.use_vail:
            self.update_dict["kl_loss"] = self.model.kl_loss
            self.update_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
            self.update_dict["z_mean_expert"] = self.model.z_mean_expert
            self.update_dict["z_mean_policy"] = self.model.z_mean_policy
            self.update_dict["beta_update"] = self.model.update_beta

        self.stats_name_to_update_name = {"Losses/GAIL Loss": "gail_loss"}
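    # The tensors gathered in `update_dict` let an external optimizer fetch the
    # discriminator's loss, train op, and estimates in a single sess.run() call;
    # `stats_name_to_update_name` maps the TensorBoard stat name to the fetched key.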
|
|
|
|
|
|
|
    def evaluate(
        self, current_info: BrainInfo, next_info: BrainInfo
    ) -> RewardSignalResult:
        """
        Evaluates the GAIL intrinsic reward for the agents in next_info by running the
        discriminator over their observations (and actions, if enabled).
        """
        if len(current_info.agents) == 0:
            return RewardSignalResult([], [])
        # Minimal evaluation sketch: assumes GAILModel exposes an `intrinsic_reward`
        # tensor and that TFPolicy provides fill_eval_dict() to populate the
        # observation placeholders.
        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.model.batch_size: len(next_info.vector_observations),
            self.policy.model.sequence_length: 1,
        }
        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
        if self.policy.use_continuous_act:
            feed_dict[
                self.policy.model.selected_actions
            ] = next_info.previous_vector_actions
        else:
            feed_dict[
                self.policy.model.action_holder
            ] = next_info.previous_vector_actions
        unscaled_reward = self.policy.sess.run(
            self.model.intrinsic_reward, feed_dict=feed_dict
        )
        # The reward is only meaningful once the discriminator has been updated at least once.
        scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
        return RewardSignalResult(scaled_reward, unscaled_reward)
|
|
    @classmethod
    def check_config(
        cls, config_dict: Dict[str, Any], param_keys: List[str] = None
    ) -> None:
        """
        Checks the config and throws an exception if a hyperparameter is missing.
        GAIL requires strength, gamma, and demo_path at a minimum.
        """
        param_keys = ["strength", "gamma", "demo_path"]
        super().check_config(config_dict, param_keys)
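    # Example of the corresponding reward-signal block in trainer_config.yaml
    # (illustrative values; only strength, gamma, and demo_path are required by
    # check_config, the remaining keys fall back to the defaults in __init__):
    #
    #     reward_signals:
    #       gail:
    #         strength: 0.01
    #         gamma: 0.99
    #         demo_path: demos/ExpertPyramid.demo
    #         encoding_size: 128
    #         use_actions: false
    #         use_vail: false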
|
|
|
|
|
|
|
    def update(self, update_buffer: Buffer, n_sequences: int) -> Dict[str, float]:
        """
        Updates model using buffer.
        :param update_buffer: The policy buffer containing the trajectories for the current policy.
        :param n_sequences: The number of sequences from demo and policy used in each mini batch.
        :return: A dictionary containing the mean GAIL loss of the update.
        """
        batch_losses = []
        # Divide by 2 since we have two buffers, so the total batch size stays roughly the same.
        n_sequences = max(n_sequences // 2, 1)
        possible_demo_batches = (
            len(self.demonstration_buffer.update_buffer["actions"]) // n_sequences
        )
        possible_policy_batches = len(update_buffer["actions"]) // n_sequences
        possible_batches = min(possible_policy_batches, possible_demo_batches)

        max_batches = self.samples_per_update // n_sequences

        kl_loss = []
        policy_estimate = []
        expert_estimate = []
        z_log_sigma_sq = []
        z_mean_expert = []
        z_mean_policy = []

        n_epoch = self.num_epoch
        for _epoch in range(n_epoch):
            self.demonstration_buffer.update_buffer.shuffle(
                sequence_length=self.policy.sequence_length
            )
            update_buffer.shuffle(sequence_length=self.policy.sequence_length)
            if max_batches == 0:
                num_batches = possible_batches
            else:
                num_batches = min(possible_batches, max_batches)
            for i in range(num_batches):
                demo_update_buffer = self.demonstration_buffer.update_buffer
                policy_update_buffer = update_buffer
                start = i * n_sequences
                end = (i + 1) * n_sequences
                mini_batch_demo = demo_update_buffer.make_mini_batch(start, end)
                mini_batch_policy = policy_update_buffer.make_mini_batch(start, end)
                run_out = self._update_batch(mini_batch_demo, mini_batch_policy)
                loss = run_out["gail_loss"]

                policy_estimate.append(run_out["policy_estimate"])
                expert_estimate.append(run_out["expert_estimate"])
                if self.model.use_vail:
                    kl_loss.append(run_out["kl_loss"])
                    z_log_sigma_sq.append(run_out["z_log_sigma_sq"])
                    z_mean_policy.append(run_out["z_mean_policy"])
                    z_mean_expert.append(run_out["z_mean_expert"])

                batch_losses.append(loss)
        self.has_updated = True

        # Log discriminator diagnostics; estimates near 0.5 mean the discriminator
        # cannot tell policy samples from expert samples.
        print_list = ["n_epoch", "beta", "policy_estimate", "expert_estimate"]
        print_vals = [
            n_epoch,
            self.policy.sess.run(self.model.beta),
            np.mean(policy_estimate),
            np.mean(expert_estimate),
        ]
        if self.model.use_vail:
            print_list += [
                "kl_loss",
                "z_mean_expert",
                "z_mean_policy",
                "z_log_sigma_sq",
            ]
            print_vals += [
                np.mean(kl_loss),
                np.mean(z_mean_expert),
                np.mean(z_mean_policy),
                np.mean(z_log_sigma_sq),
            ]
        LOGGER.debug(
            "GAIL Debug:\n\t\t"
            + "\n\t\t".join(
                "{0}: {1}".format(_name, _val)
                for _name, _val in zip(print_list, print_vals)
            )
        )
        update_stats = {"Losses/GAIL Loss": np.mean(batch_losses)}
        return update_stats
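    # _update_batch performs one discriminator update on a pair of equally sized
    # minibatches: one sampled from the expert demonstrations and one from the current
    # policy's buffer. In GAIL, the policy's intrinsic reward reflects how "expert-like"
    # the discriminator judges its transitions to be, so the two networks are trained
    # in tandem.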
|
|
|
|
|
|
|
    def _update_batch(
        self,
        mini_batch_demo: Dict[str, np.ndarray],
        mini_batch_policy: Dict[str, np.ndarray],
    ) -> Dict[str, float]:
        """
        Helper method for update.
        :param mini_batch_demo: A mini batch of expert trajectories.
        :param mini_batch_policy: A mini batch of trajectories sampled from the current policy.
        :return: Output from update process.
        """
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.done_expert_holder: mini_batch_demo["done"],
            self.model.done_policy_holder: mini_batch_policy["done"],
        }

        feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
        if self.policy.use_continuous_act:
            feed_dict[self.policy.model.selected_actions] = mini_batch_policy["actions"]
        else:
            feed_dict[self.policy.model.action_holder] = mini_batch_policy["actions"]
        for i in range(len(self.policy.model.visual_in)):
            feed_dict[self.policy.model.visual_in[i]] = mini_batch_policy[
                "visual_obs%d" % i
            ]
            feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
                "visual_obs%d" % i
            ]
        if self.policy.use_vec_obs:
            feed_dict[self.policy.model.vector_in] = mini_batch_policy["vector_obs"]
            feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]

        out_dict = {
            "gail_loss": self.model.loss,
            "update_batch": self.model.update_batch,
            "policy_estimate": self.model.policy_estimate,
            "expert_estimate": self.model.expert_estimate,
        }
        if self.model.use_vail:
            out_dict["kl_loss"] = self.model.kl_loss
            out_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
            out_dict["z_mean_expert"] = self.model.z_mean_expert
            out_dict["z_mean_policy"] = self.model.z_mean_policy

        run_out = self.policy.sess.run(out_dict, feed_dict=feed_dict)
        if self.model.use_vail:
            self.update_beta(run_out["kl_loss"])
        return run_out

    def prepare_update(
        self,
        policy_model: LearningModel,
        mini_batch_policy: Dict[str, np.ndarray],
        num_sequences: int,
    ) -> Dict[tf.Tensor, Any]:
        """
        Prepare inputs for update.
        :param policy_model: The policy model whose placeholders are fed with the policy samples.
        :param mini_batch_policy: The mini batch sampled from the policy's update buffer.
        :param num_sequences: The number of sequences in the mini batch.
        :return: Feed_dict for update process.
        """
        max_num_experiences = min(
            len(mini_batch_policy["actions"]),
            len(self.demonstration_buffer.update_buffer["actions"]),
        )
        # If num_sequences is less, we need to shorten the input batch.
        for key, element in mini_batch_policy.items():
            mini_batch_policy[key] = element[:max_num_experiences]
        # Get demo buffer
        self.demonstration_buffer.update_buffer.shuffle(1)
        # TODO: Replace with SAC sample method
        mini_batch_demo = self.demonstration_buffer.update_buffer.make_mini_batch(
            0, len(mini_batch_policy["actions"])
        )

        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.done_expert_holder: mini_batch_demo["done"],
            self.model.done_policy_holder: mini_batch_policy["done"],
        }

        feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
        if self.policy.use_continuous_act:
            feed_dict[policy_model.selected_actions] = mini_batch_policy["actions"]
        else:
            feed_dict[policy_model.action_holder] = mini_batch_policy["actions"]
        for i in range(len(policy_model.visual_in)):
            feed_dict[policy_model.visual_in[i]] = mini_batch_policy[
                "visual_obs%d" % i
            ]
            feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
                "visual_obs%d" % i
            ]
        if self.policy.use_vec_obs:
            feed_dict[policy_model.vector_in] = mini_batch_policy["vector_obs"]
            feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]

        self.has_updated = True
        return feed_dict
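    # VAIL constrains the discriminator through a variational bottleneck: beta is the
    # Lagrange multiplier on the KL term, and it is adapted (dual gradient ascent) so
    # that the measured KL divergence stays close to the bottleneck target defined in
    # the model.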
|
|
|
|
|
|
|
    def update_beta(self, kl_div: float) -> None:
        """
        Updates the Beta parameter with the latest KL divergence value.
        The larger Beta, the stronger the weight of the KL divergence in the loss function.
        :param kl_div: The KL divergence.
        """
        self.policy.sess.run(
            self.model.update_beta, feed_dict={self.model.kl_div_input: kl_div}
        )
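    # A minimal sketch of how a trainer-side optimizer might drive this signal through
    # the `prepare_update` / `update_dict` path (illustrative only; `policy`,
    # `optimizer_feed_dict`, `mini_batch`, and `num_sequences` are assumed to come from
    # the surrounding trainer code):
    #
    #     gail = GAILRewardSignal(policy, policy.model, strength=0.01, gamma=0.99,
    #                             demo_path="ExpertDemo.demo")
    #     feed_dict = dict(optimizer_feed_dict)
    #     feed_dict.update(gail.prepare_update(policy.model, mini_batch, num_sequences))
    #     run_out = policy.sess.run(gail.update_dict, feed_dict=feed_dict)
    #     stats = {stat: run_out[key] for stat, key in gail.stats_name_to_update_name.items()}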