Compare commits

...
This merge request contains changes that conflict with the target branch. Conflicting files:
/config/ppo/3DBallHard.yaml
/Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs
/ml-agents/mlagents/trainers/settings.py
/ml-agents/mlagents/trainers/trainer_util.py
/Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab

1 commit

Author        SHA1      Message         Commit date
Andrew Cohen  b37c4493  floor material  4 years ago

19 files changed, with 2085 additions and 11 deletions
  1. config/ppo/3DBallHard.yaml (2 changes)
  2. ml-agents/mlagents/trainers/trainer_util.py (11 changes)
  3. ml-agents/mlagents/trainers/settings.py (4 changes)
  4. Project/ProjectSettings/EditorBuildSettings.asset (8 changes)
  5. Project/ProjectSettings/UnityConnectSettings.asset (2 changes)
  6. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (4 changes)
  7. Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab (10 changes)
  8. ml-agents/mlagents/trainers/policy/transfer_policy.py (920 changes)
  9. Project/Assets/ML-Agents/Examples/Crawler/Physics_materials.meta (8 changes)
  10. config/ppo_transfer/3DBall.yaml (26 changes)
  11. config/ppo_transfer/3DBallHard.yaml (26 changes)
  12. ml-agents/mlagents/trainers/ppo_transfer/__init__.py (0 changes)
  13. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (703 changes)
  14. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (350 changes)
  15. Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial (14 changes)
  16. Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial.meta (8 changes)

config/ppo/3DBallHard.yaml (2 changes)


gamma: 0.995
strength: 1.0
keep_checkpoints: 5
- max_steps: 5000000
+ max_steps: 4000000
time_horizon: 1000
summary_freq: 12000
threaded: true

ml-agents/mlagents/trainers/trainer_util.py (11 changes)


from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.ppo_transfer.trainer import PPOTransferTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController

)
elif trainer_type == TrainerType.SAC:
trainer = SACTrainer(
brain_name,
min_lesson_length,
trainer_settings,
train_model,
load_model,
seed,
trainer_artifact_path,
)
elif trainer_type == TrainerType.PPO_Transfer:
trainer = PPOTransferTrainer(
brain_name,
min_lesson_length,
trainer_settings,
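The hunk above is truncated after trainer_settings, so the remaining constructor arguments of the new branch are not shown here. As a rough, self-contained sketch of the dispatch pattern this change extends (stub classes and a hypothetical make_trainer helper, not the actual initialize_trainer code):

# Hypothetical sketch only: the real trainer classes are stubbed out.
class PPOTrainer: ...
class SACTrainer: ...
class PPOTransferTrainer: ...   # the trainer added by this merge request

_TRAINERS = {
    "ppo": PPOTrainer,
    "sac": SACTrainer,
    "ppo_transfer": PPOTransferTrainer,
}

def make_trainer(trainer_type: str, *args, **kwargs):
    """Pick the trainer class named by a behavior's trainer_type and build it."""
    return _TRAINERS[trainer_type](*args, **kwargs)

print(type(make_trainer("ppo_transfer")).__name__)  # -> PPOTransferTrainer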

ml-agents/mlagents/trainers/settings.py (4 changes)


class TrainerType(Enum):
PPO: str = "ppo"
SAC: str = "sac"
+ PPO_Transfer: str = "ppo_transfer"
- _mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings}
+ _mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings,
+             TrainerType.PPO_Transfer: PPOSettings}
return _mapping[self]
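For context, a minimal self-contained sketch of what this settings hunk does: the new PPO_Transfer member is routed to the existing PPOSettings class through the _mapping dict. The settings classes are stubbed and the enclosing method name (to_settings) is assumed here:

from enum import Enum

class PPOSettings: ...
class SACSettings: ...

class TrainerType(Enum):
    PPO = "ppo"
    SAC = "sac"
    PPO_Transfer = "ppo_transfer"

    def to_settings(self) -> type:
        # ppo_transfer reuses the PPO hyperparameter settings class.
        _mapping = {
            TrainerType.PPO: PPOSettings,
            TrainerType.SAC: SACSettings,
            TrainerType.PPO_Transfer: PPOSettings,
        }
        return _mapping[self]

print(TrainerType("ppo_transfer").to_settings().__name__)  # -> PPOSettings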

Project/ProjectSettings/EditorBuildSettings.asset (8 changes)


EditorBuildSettings:
m_ObjectHideFlags: 0
serializedVersion: 2
- m_Scenes: []
+ m_Scenes:
+ - enabled: 0
+   path: Assets/ML-Agents/Examples/3DBall/Scenes/3DBallHard.unity
+   guid: 35c41099ceec44889bdbe95ed86c97ac
+ - enabled: 1
+   path: Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity
+   guid: b9ac0cbf961bf4dacbfa0aa9c0d60aaa
m_configObjects: {}

Project/ProjectSettings/UnityConnectSettings.asset (2 changes)


UnityConnectSettings:
m_ObjectHideFlags: 0
serializedVersion: 1
- m_Enabled: 1
+ m_Enabled: 0
m_TestMode: 0
m_EventOldUrl: https://api.uca.cloud.unity3d.com/v1/events
m_EventUrl: https://cdp.cloud.unity3d.com/v1/events

Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (4 changes)


public override void OnActionReceived(float[] vectorAction)
{
- var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);
- var actionX = 2f * Mathf.Clamp(vectorAction[1], -1f, 1f);
+ var actionZ = 1f * Mathf.Clamp(vectorAction[0], -1f, 1f);
+ var actionX = 1f * Mathf.Clamp(vectorAction[1], -1f, 1f);
if ((gameObject.transform.rotation.z < 0.25f && actionZ > 0f) ||
(gameObject.transform.rotation.z > -0.25f && actionZ < 0f))

Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab (10 changes)


m_LocalPosition: {x: 200, y: 0, z: 0}
m_LocalScale: {x: 1, y: 1, z: 1}
m_Children:
- {fileID: 3386028169429758297}
- {fileID: 3386028169429758297}
m_Father: {fileID: 0}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}

m_LocalScale: {x: 1, y: 1, z: 1}
m_Children: []
m_Father: {fileID: 4309919623019186}
- m_RootOrder: 2
+ m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!33 &33357510309310810
MeshFilter:

m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1846708386698568}
- m_Material: {fileID: 0}
+ m_Material: {fileID: 13400000, guid: dff6e5680d76643a481e8d81555ef3ee, type: 2}
m_IsTrigger: 0
m_Enabled: 1
serializedVersion: 2

- {fileID: 4856650706546504}
- {fileID: 4791482523457020}
m_Father: {fileID: 4309919623019186}
- m_RootOrder: 1
+ m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!1 &1995322274649904
GameObject:

- target: {fileID: 4845971001715176651, guid: 0456c89e8c9c243d595b039fe7aa0bf9,
type: 3}
propertyPath: m_RootOrder
- value: 0
+ value: 2
objectReference: {fileID: 0}
- target: {fileID: 4845971001715176651, guid: 0456c89e8c9c243d595b039fe7aa0bf9,
type: 3}

ml-agents/mlagents/trainers/policy/transfer_policy.py (920 changes)


import os
from typing import Any, Dict, Optional, List, Tuple
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)
import tf_slim as slim
EPSILON = 1e-6 # Small value to avoid divide by zero
class GaussianEncoderDistribution:
def __init__(
self,
encoded: tf.Tensor,
feature_size: int
):
self.mu = tf.layers.dense(
encoded,
feature_size,
activation=None,
name="mu",
kernel_initializer=ModelUtils.scaled_init(0.01),
reuse=tf.AUTO_REUSE,
)
self.log_sigma = tf.layers.dense(
encoded,
feature_size,
activation=None,
name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
)
self.sigma = tf.exp(self.log_sigma)
def sample(self):
epsilon = tf.random_normal(tf.shape(self.mu))
sampled = self.mu + self.sigma * epsilon
return sampled
def kl_standard(self):
"""
KL divergence with a standard gaussian
"""
kl = 0.5 * tf.reduce_sum(tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1)
return kl
class TransferPolicy(TFPolicy):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_params: TrainerSettings,
is_training: bool,
model_path: str,
load: bool,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
create_tf_graph: bool = True,
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param brain: Assigned BrainParameters object.
:param trainer_params: Defined training parameters.
:param is_training: Whether the model should be trained.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param model_path: Path where the model should be saved and loaded.
:param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy in continuous output.
"""
super().__init__(seed, brain, trainer_params, model_path, load)
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = self.network_settings.num_layers
self.h_size = self.network_settings.hidden_units
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.trainable_variables: List[tf.Variable] = []
self.encoder = None
self.encoder_distribution = None
self.targ_encoder = None
# Model-based learning
self.feature_size = 16 # dimension of latent feature size
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
if create_tf_graph:
self.create_tf_graph()
def get_trainable_variables(self) -> List[tf.Variable]:
"""
Returns a List of the trainable variables in this policy. if create_tf_graph hasn't been called,
returns empty list.
"""
return self.trainable_variables
def create_tf_graph(self,
encoder_layers = 1,
policy_layers = 1,
policy_units = 128,
transfer=False,
separate_train=False,
var_encoder=False,
var_predict=False,
predict_return=False,
inverse_model=False
) -> None:
"""
Builds the tensorflow graph needed for this policy.
"""
self.inverse_model = inverse_model
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
if len(_vars) > 0:
# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
)
self.next_visual_in: List[tf.Tensor] = []
with tf.variable_scope("encoding"):
self.encoder, self.targ_encoder = self.create_encoders()
with tf.variable_scope("inverse"):
self.create_inverse_model(self.encoder, self.targ_encoder)
with tf.variable_scope("predict"):
self.create_forward_model(self.encoder, self.targ_encoder)
# if var_encoder:
# self.encoder_distribution, self.encoder = self._create_var_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# _, self.targ_encoder = self._create_var_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# else:
# self.encoder = self._create_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# self.targ_encoder = self._create_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# self._create_hard_copy()
# if var_predict:
# self.predict_distribution, self.predict = self._create_var_world_model(
# self.encoder,
# self.h_size,
# self.feature_size,
# self.num_layers,
# self.vis_encode_type,
# predict_return
# )
# else:
# self.predict = self._create_world_model(
# self.encoder,
# self.h_size,
# self.feature_size,
# self.num_layers,
# self.vis_encode_type,
# predict_return
# )
# if inverse_model:
# self._create_inverse_model(self.encoder, self.targ_encoder)
if self.use_continuous_act:
self._create_cc_actor(
self.encoder,
policy_units,
policy_layers,
self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
separate_train
)
else:
self._create_dc_actor(self.encoder, policy_units, policy_layers, separate_train)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="encoding"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="predict"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
if not transfer:
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="inverse"
)
self.inference_dict: Dict[str, tf.Tensor] = {
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.memory_out
# We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
# it will re-load the full graph
self._initialize_graph()
# slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True)
def load_graph_partial(self, path: str, transfer_type="dynamics"):
load_nets = {"dynamics": ["policy", "predict", "value"],
"observation": ["encoding", "inverse"]}
if self.inverse_model:
load_nets["dynamics"].append("inverse")
with self.graph.as_default():
for net in load_nets[transfer_type]:
variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, net)
partial_saver = tf.train.Saver(variables_to_restore)
partial_model_checkpoint = os.path.join(path, f"{net}.ckpt")
partial_saver.restore(self.sess, partial_model_checkpoint)
print("loaded net", net, "from path", path)
# variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
# partial_saver = tf.train.Saver(variables_to_restore)
# partial_model_checkpoint = os.path.join(path, f"latent.ckpt")
# partial_saver.restore(self.sess, partial_model_checkpoint)
# print("loaded net latent from path", path)
if transfer_type == "observation":
self.run_hard_copy()
def _create_world_model(
self,
encoder: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
predict_return: bool=False
) -> tf.Tensor:
""""
Builds the world model for state prediction
"""
with self.graph.as_default():
with tf.variable_scope("predict"):
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
)
hidden_stream = ModelUtils.create_vector_observation_encoder(
tf.concat([encoder, self.current_action], axis=1),
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
if predict_return:
predict = tf.layers.dense(
hidden_stream,
feature_size+1,
name="next_state"
)
else:
predict = tf.layers.dense(
hidden_stream,
feature_size,
name="next_state"
)
return predict
def _create_var_world_model(
self,
encoder: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
predict_return: bool=False
) -> tf.Tensor:
""""
Builds the world model for state prediction
"""
with self.graph.as_default():
with tf.variable_scope("predict"):
hidden_stream = ModelUtils.create_vector_observation_encoder(
tf.concat([encoder, self.current_action], axis=1),
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
with tf.variable_scope("latent"):
if predict_return:
predict_distribution = GaussianEncoderDistribution(
hidden_stream,
feature_size+1
)
else:
predict_distribution = GaussianEncoderDistribution(
hidden_stream,
feature_size
)
predict = predict_distribution.sample()
return predict_distribution, predict
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
"""
Evaluates policy for the agent experiences provided.
:param decision_requests: DecisionSteps object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
:return: Outputs from network as defined by self.inference_dict.
"""
feed_dict = {
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def _create_target_encoder(
self,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.vector_next,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.vector_next
with tf.variable_scope("target_enc"):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
latent_targ = tf.layers.dense(
hidden_stream_targ,
feature_size,
name="latent"
)
return tf.stop_gradient(latent_targ)
def _create_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("encoding"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in,
vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
latent = tf.layers.dense(
hidden_stream,
feature_size,
name="latent"
)
return latent
def _create_var_target_encoder(
self,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.vector_next,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.vector_next
with tf.variable_scope("target_enc"):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
with tf.variable_scope("latent"):
latent_targ_distribution = GaussianEncoderDistribution(
hidden_stream_targ,
feature_size
)
latent_targ = latent_targ_distribution.sample()
return latent_targ_distribution, tf.stop_gradient(latent_targ)
def _create_var_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType
) -> tf.Tensor:
"""
Creates a variational encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("encoding"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in,
vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
with tf.variable_scope("latent"):
latent_distribution = GaussianEncoderDistribution(
hidden_stream,
feature_size
)
latent = latent_distribution.sample()
return latent_distribution, latent
def _create_hard_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_enc')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoding')
with tf.variable_scope('hard_replacement'):
self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
def run_hard_copy(self):
self.sess.run(self.target_replace_op)
def _create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
with tf.variable_scope("inverse"):
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, self.h_size, activation=ModelUtils.swish)
if self.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(
hidden, self.act_size[0], activation=None
)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.current_action), axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
else:
pred_action = tf.concat(
[
tf.layers.dense(
hidden, self.act_size[i], activation=tf.nn.softmax
)
for i in range(len(self.act_size))
],
axis=1,
)
cross_entropy = tf.reduce_sum(
-tf.log(pred_action + 1e-10) * self.current_action, axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
)
def _create_cc_actor(
self,
encoded: tf.Tensor,
h_size: int,
num_layers: int,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
separate_train: bool = False
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
if separate_train:
hidden_policy = tf.stop_gradient(hidden_policy)
with tf.variable_scope("policy"):
hidden_policy = ModelUtils.create_vector_observation_encoder(
hidden_policy,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False,
)
distribution = GaussianDistribution(
hidden_policy,
self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:
self.output_pre = distribution.sample
self.output = tf.identity(self.output_pre, name="action")
else:
self.output_pre = distribution.sample
# Clip and scale output to ensure actions are always within [-1, 1] range.
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(self.output)
self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
self.entropy = distribution.entropy
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(
self,
encoded: tf.Tensor,
h_size: int,
num_layers: int,
separate_train: bool = False
) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
"""
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
hidden_policy,
self.memory_in,
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
else:
hidden_policy = encoded
if separate_train:
hidden_policy = tf.stop_gradient(hidden_policy)
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
with tf.variable_scope("policy"):
hidden_policy = ModelUtils.create_vector_observation_encoder(
hidden_policy,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False,
)
distribution = MultiCategoricalDistribution(
hidden_policy, self.act_size, self.action_masks
)
# It's important that we are able to feed_dict a value into this tensor to get the
# right one-hot encoding, so we can't do identity on it.
self.output = distribution.sample
self.all_log_probs = tf.identity(distribution.log_probs, name="action")
self.selected_actions = tf.stop_gradient(
distribution.sample_onehot
) # In discrete, these are onehot
self.entropy = distribution.entropy
self.total_log_probs = distribution.total_log_probs
def save_model(self, steps):
"""
Saves the model
:param steps: The number of steps the model was trained for
:return:
"""
with self.graph.as_default():
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
self.saver.save(self.sess, last_checkpoint)
tf.train.write_graph(
self.graph, self.model_path, "raw_graph_def.pb", as_text=False
)
# save each net separately
policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
policy_saver = tf.train.Saver(policy_vars)
policy_checkpoint = os.path.join(self.model_path, f"policy.ckpt")
policy_saver.save(self.sess, policy_checkpoint)
encoding_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
encoding_saver = tf.train.Saver(encoding_vars)
encoding_checkpoint = os.path.join(self.model_path, f"encoding.ckpt")
encoding_saver.save(self.sess, encoding_checkpoint)
# latent_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
# latent_saver = tf.train.Saver(latent_vars)
# latent_checkpoint = os.path.join(self.model_path, f"latent.ckpt")
# latent_saver.save(self.sess, latent_checkpoint)
predict_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
predict_saver = tf.train.Saver(predict_vars)
predict_checkpoint = os.path.join(self.model_path, f"predict.ckpt")
predict_saver.save(self.sess, predict_checkpoint)
value_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
value_saver = tf.train.Saver(value_vars)
value_checkpoint = os.path.join(self.model_path, f"value.ckpt")
value_saver.save(self.sess, value_checkpoint)
if self.inverse_model:
inverse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse")
inverse_saver = tf.train.Saver(inverse_vars)
inverse_checkpoint = os.path.join(self.model_path, f"inverse.ckpt")
inverse_saver.save(self.sess, inverse_checkpoint)
def get_encoder_weights(self):
with self.graph.as_default():
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding/main_graph_0/hidden_0/bias:0")
targ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/main_graph_0/hidden_0/bias:0")
print("encoding:", self.sess.run(enc))
print("target:", self.sess.run(targ))
def create_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
encoded_state_list = []
encoded_next_state_list = []
if self.vis_obs_size > 0:
self.next_visual_in = []
visual_encoders = []
next_visual_encoders = []
for i in range(self.vis_obs_size):
# Create input ops for next (t+1) visual observations.
next_visual_input = ModelUtils.create_visual_input(
self.brain.camera_resolutions[i],
name="curiosity_next_visual_observation_" + str(i),
)
self.next_visual_in.append(next_visual_input)
# Create the encoder ops for current and next visual input.
# Note that these encoders are siamese.
encoded_visual = ModelUtils.create_visual_observation_encoder(
self.visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_stream_{}_visual_obs_encoder".format(i),
False,
)
encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_stream_{}_visual_obs_encoder".format(i),
True,
)
visual_encoders.append(encoded_visual)
next_visual_encoders.append(encoded_next_visual)
hidden_visual = tf.concat(visual_encoders, axis=1)
hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
encoded_state_list.append(hidden_visual)
encoded_next_state_list.append(hidden_next_visual)
if self.vec_obs_size > 0:
# Create the encoder ops for current and next vector input.
# Note that these encoders are siamese.
# Create input op for next (t+1) vector observation.
self.next_vector_in = tf.placeholder(
shape=[None, self.vec_obs_size],
dtype=tf.float32,
name="curiosity_next_vector_observation",
)
encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
self.vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_vector_obs_encoder",
False,
)
encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
self.next_vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_vector_obs_encoder",
True,
)
encoded_state_list.append(encoded_vector_obs)
encoded_next_state_list.append(encoded_next_vector_obs)
encoded_state = tf.concat(encoded_state_list, axis=1)
encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
encoded_state = tf.layers.dense(
encoded_state,
self.feature_size,
name="latent"
)
encoded_next_state = tf.layers.dense(
encoded_next_state,
self.feature_size,
name="latent",
reuse=True
)
return encoded_state, encoded_next_state
def create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
# hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
if self.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(
combined_input, self.act_size[0], activation=None
)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.current_action), axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
else:
pred_action = tf.concat(
[
tf.layers.dense(
combined_input, self.act_size[i], activation=tf.nn.softmax
)
for i in range(len(self.act_size))
],
axis=1,
)
cross_entropy = tf.reduce_sum(
-tf.log(pred_action + 1e-10) * self.current_action, axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
)
def create_forward_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates forward model TensorFlow ops for Curiosity module.
Predicts encoded future state based on encoded current state and given action.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
combined_input = tf.concat(
[encoded_state, self.current_action], axis=1
)
# hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
predict = tf.layers.dense(
combined_input,
self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
activation=None,
)
self.predict = tf.layers.dense(
predict,
self.feature_size,
name="latent"
)
squared_difference = 0.5 * tf.reduce_sum(
tf.squared_difference(self.predict, encoded_next_state), axis=1
)
self.intrinsic_reward = squared_difference
self.forward_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
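To make the forward-model objective above concrete, here is a small numpy-only illustration (not the PR's TensorFlow code) of the same quantities: the intrinsic reward is half the squared error between the predicted and actual next-state encodings, and the forward loss is its mean (the step mask applied via tf.dynamic_partition is omitted):

import numpy as np

rng = np.random.default_rng(0)
feature_size = 16                                   # latent size used by TransferPolicy
predict = rng.normal(size=(4, feature_size))        # predicted next-state encoding
encoded_next_state = rng.normal(size=(4, feature_size))

squared_difference = 0.5 * np.sum((predict - encoded_next_state) ** 2, axis=1)
intrinsic_reward = squared_difference               # one curiosity-style reward per sample
forward_loss = squared_difference.mean()            # mean over samples (mask omitted)
print(intrinsic_reward.shape, round(float(forward_loss), 3))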

Project/Assets/ML-Agents/Examples/Crawler/Physics_materials.meta (8 changes)


fileFormatVersion: 2
guid: 87be98804068a4296bc57ee51587f7b7
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

config/ppo_transfer/3DBall.yaml (26 changes)


behaviors:
3DBall:
trainer_type: ppo_transfer
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 1
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

config/ppo_transfer/3DBallHard.yaml (26 changes)


behaviors:
3DBallHard:
trainer_type: ppo_transfer
hyperparameters:
batch_size: 1200
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 1
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.995
strength: 1.0
keep_checkpoints: 5
max_steps: 4000000
time_horizon: 1000
summary_freq: 12000
threaded: true

ml-agents/mlagents/trainers/ppo_transfer/__init__.py (0 changes)

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (703 changes)


from typing import Optional, Any, Dict, cast
import numpy as np
import os
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.policy.transfer_policy import TransferPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
import tf_slim as slim
class PPOTransferOptimizer(TFOptimizer):
def __init__(self, policy: TransferPolicy, trainer_params: TrainerSettings):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
The PPO optimizer has a value estimator and a loss function.
:param policy: A TFPolicy object that will be updated by this PPO Optimizer.
:param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
"""
self.separate_value_train = False
self.separate_policy_train = False
self.use_var_encoder = False
self.use_var_predict = False
self.with_prior = False
self.use_inverse_model = True
self.predict_return = False
self.use_alter = False
self.in_batch_alter = False
self.in_epoch_alter = False
self.num_updates = 0
self.alter_every = 400
self.copy_every = 1
self.train_type = "all"
# Transfer
self.use_transfer = False
self.smart_transfer = False
self.conv_thres = 1e-6
self.old_loss = np.inf
self.update_mode = "model"
self.transfer_path = "results/BallSingle_nosep_cmodel_small/3DBall"
self.transfer_type = "dynamics"
self.ppo_update_dict: Dict[str, tf.Tensor] = {}
self.model_update_dict: Dict[str, tf.Tensor] = {}
# Create the graph here to give more granular control of the TF graph to the Optimizer.
policy.create_tf_graph(1, 1, 128, self.use_transfer, self.separate_policy_train,
self.use_var_encoder, self.use_var_predict, self.predict_return, self.use_inverse_model)
with policy.graph.as_default():
super().__init__(policy, trainer_params)
hyperparameters: PPOSettings = cast(
PPOSettings, trainer_params.hyperparameters
)
lr = float(hyperparameters.learning_rate)
self._schedule = hyperparameters.learning_rate_schedule
epsilon = float(hyperparameters.epsilon)
beta = float(hyperparameters.beta)
max_step = float(trainer_params.max_steps)
policy_network_settings = policy.network_settings
h_size = int(policy_network_settings.hidden_units)
num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.burn_in_ratio = 0.0
self.stream_names = list(self.reward_signals.keys())
self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
self.grads = None
self.update_batch: Optional[tf.Operation] = None
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Model Loss": "model_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
if self.policy.use_recurrent:
self.m_size = self.policy.m_size
self.memory_in = tf.placeholder(
shape=[None, self.m_size],
dtype=tf.float32,
name="recurrent_value_in",
)
if num_layers < 1:
num_layers = 1
with tf.variable_scope("value"):
if policy.use_continuous_act:
self._create_cc_critic(h_size, num_layers, vis_encode_type)
else:
self._create_dc_critic(h_size, num_layers, vis_encode_type)
with tf.variable_scope("optimizer/"):
self.learning_rate = ModelUtils.create_schedule(
self._schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy.total_log_probs,
self.old_log_probs,
self.value_heads,
self.policy.entropy,
self.policy.targ_encoder,
self.policy.predict,
self.policy.encoder_distribution,
beta,
epsilon,
lr,
max_step,
)
self._create_ppo_optimizer_ops()
self.update_dict.update(
{
"value_loss": self.value_loss,
"policy_loss": self.abs_policy_loss,
"model_loss": self.model_loss,
"update_batch": self.update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)
if self.use_alter or self.smart_transfer or self.in_batch_alter or self.in_epoch_alter:
self._init_alter_update()
self.policy.initialize_or_load()
if self.use_transfer:
self.policy.load_graph_partial(self.transfer_path, self.transfer_type)
self.policy.get_encoder_weights()
# saver = tf.train.Saver()
# model_checkpoint = os.path.join(self.transfer_path, f"model-4000544.ckpt")
# saver.restore(self.sess, model_checkpoint)
# self.policy._set_step(0)
slim.model_analyzer.analyze_vars(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), print_info=True)
print("All variables in the graph:")
for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
print(variable.name)
# tf.summary.FileWriter(self.policy.model_path, self.sess.graph)
def _create_cc_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
if self.separate_value_train:
input_state = tf.stop_gradient(self.policy.encoder)
else:
input_state = self.policy.encoder
hidden_value = ModelUtils.create_vector_observation_encoder(
input_state,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
self.old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
def _create_dc_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
if self.separate_value_train:
input_state = tf.stop_gradient(self.policy.encoder)
else:
input_state = self.policy.encoder
hidden_value = ModelUtils.create_vector_observation_encoder(
input_state,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
# Break old log probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)
_, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
old_log_prob_branches, self.policy.action_masks, self.policy.act_size
)
action_idx = [0] + list(np.cumsum(self.policy.act_size))
self.old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.policy.selected_actions[
:, action_idx[i] : action_idx[i + 1]
],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
def _create_losses(
self, probs, old_probs, value_heads, entropy, targ_encoder, predict, encoder_distribution,
beta, epsilon, lr, max_step
):
"""
Creates training-specific Tensorflow ops for PPO models.
:param probs: Current policy probabilities
:param old_probs: Past policy probabilities
:param value_heads: Value estimate tensors from each value stream
:param beta: Entropy regularization strength
:param entropy: Current policy entropy
:param epsilon: Value for policy-divergence threshold
:param lr: Learning rate
:param max_step: Total number of training steps.
"""
self.returns_holders = {}
self.old_values = {}
for name in value_heads.keys():
returns_holder = tf.placeholder(
shape=[None], dtype=tf.float32, name="{}_returns".format(name)
)
old_value = tf.placeholder(
shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)
)
self.returns_holders[name] = returns_holder
self.old_values[name] = old_value
self.advantage = tf.placeholder(
shape=[None], dtype=tf.float32, name="advantages"
)
advantage = tf.expand_dims(self.advantage, -1)
self.decay_epsilon = ModelUtils.create_schedule(
self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
)
self.decay_beta = ModelUtils.create_schedule(
self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-self.decay_epsilon,
self.decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
1
]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(
r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
)
* advantage
)
self.policy_loss = -tf.reduce_mean(
tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
)
# For cleaner stats reporting
self.abs_policy_loss = tf.abs(self.policy_loss)
# encoder and predict loss
self.dis_returns = tf.placeholder(
shape=[None], dtype=tf.float32, name="dis_returns"
)
# target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
# if self.predict_return:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
# else:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
# if self.with_prior:
# if self.use_var_encoder:
# self.model_loss += encoder_distribution.kl_standard()
# if self.use_var_predict:
# self.model_loss += self.policy.predict_distribution.kl_standard()
# if self.use_inverse_model:
# self.model_loss += self.policy.inverse_loss
self.model_loss = 0.2 * self.policy.forward_loss + 0.8 * self.policy.inverse_loss
self.loss = (
self.policy_loss
+ self.model_loss
+ 0.5 * self.value_loss
- self.decay_beta
* tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
)
self.ppo_loss = (
self.policy_loss
+ 0.5 * self.value_loss
- self.decay_beta
* tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
)
def _create_ppo_optimizer_ops(self):
if self.use_transfer:
if self.transfer_type == "dynamics":
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += self.policy.get_trainable_variables
print("trainable", train_vars)
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/mu")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/log_std")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
elif self.transfer_type == "observation":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
else:
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
train_vars += self.policy.get_trainable_variables()
print("trainable", train_vars)
self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)
self.update_batch = self.tf_optimizer.minimize(self.loss, var_list=train_vars)
def _init_alter_update(self):
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.ppo_update_dict.update(
{
"value_loss": self.value_loss,
"policy_loss": self.abs_policy_loss,
"update_batch": self.ppo_update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)
self.model_update_dict.update(
{
"model_loss": self.model_loss,
"update_batch": self.model_update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)
@timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""
Performs update on model.
:param mini_batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
feed_dict = self._construct_feed_dict(batch, num_sequences)
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():
feed_dict.update(
reward_signal.prepare_update(self.policy, batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
if self.use_alter:
if self.num_updates / self.alter_every == 0:
update_vals = self._execute_model(feed_dict, self.update_dict)
if self.num_updates % self.alter_every == 0:
print("start update all", self.num_updates)
elif (self.num_updates / self.alter_every) % 2 == 1:
update_vals = self._execute_model(feed_dict, self.model_update_dict)
if self.num_updates % self.alter_every == 0:
print("start update model", self.num_updates)
else: # (self.num_updates / self.alter_every) % 2 == 0:
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
if self.num_updates % self.alter_every == 0:
print("start update policy", self.num_updates)
elif self.in_batch_alter:
update_vals = self._execute_model(feed_dict, self.model_update_dict)
update_vals.update(self._execute_model(feed_dict, self.ppo_update_dict))
elif self.use_transfer and self.smart_transfer:
if self.update_mode == "model":
update_vals = self._execute_model(feed_dict, self.update_dict)
cur_loss = update_vals["model_loss"]
print("model loss:", cur_loss)
if abs(cur_loss - self.old_loss) < self.conv_thres:
self.update_mode = "policy"
print("start to train policy")
else:
self.old_loss = cur_loss
if self.update_mode == "policy":
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
else:
update_vals = self._execute_model(feed_dict, self.update_dict)
# update target encoder
# if self.num_updates % self.copy_every == 0:
# self.policy.run_hard_copy()
# print("copy")
# self.policy.get_encoder_weights()
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
self.num_updates += 1
return update_stats
def update_part(self, batch: AgentBuffer, num_sequences: int, update_type: str="policy") -> Dict[str, float]:
"""
Performs update on model.
:param mini_batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
feed_dict = self._construct_feed_dict(batch, num_sequences)
if update_type == "model":
stats_needed = {
"Losses/Model Loss": "model_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
elif update_type == "policy":
stats_needed = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():
feed_dict.update(
reward_signal.prepare_update(self.policy, batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
if update_type == "model":
update_vals = self._execute_model(feed_dict, self.model_update_dict)
elif update_type == "policy":
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
# update target encoder
# self.policy.run_hard_copy()
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
return update_stats
def _construct_feed_dict(
self, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
burn_in_mask[range(0, num_burn_in)] = 0
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
self.policy.batch_size_ph: num_sequences,
self.policy.sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
# self.policy.processed_vector_next: mini_batch["next_vector_in"],
self.policy.next_vector_in: mini_batch["next_vector_in"],
self.policy.current_action: mini_batch["actions"],
self.dis_returns: mini_batch["discounted_returns"]
}
for name in self.reward_signals:
feed_dict[self.returns_holders[name]] = mini_batch[
"{}_returns".format(name)
]
feed_dict[self.old_values[name]] = mini_batch[
"{}_value_estimates".format(name)
]
if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
else:
feed_dict[self.policy.output] = mini_batch["actions"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
if "vector_obs" in mini_batch:
feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
if self.policy.vis_obs_size > 0:
for i, _ in enumerate(self.policy.visual_in):
feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
if self.policy.use_recurrent:
feed_dict[self.policy.memory_in] = [
mini_batch["memory"][i]
for i in range(
0, len(mini_batch["memory"]), self.policy.sequence_length
)
]
feed_dict[self.memory_in] = self._make_zero_mem(
self.m_size, mini_batch.num_experiences
)
return feed_dict
def _create_cc_critic_old(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
hidden_stream = ModelUtils.create_observation_streams(
self.policy.visual_in,
self.policy.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
if self.policy.use_recurrent:
hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
hidden_stream,
self.memory_in,
self.policy.sequence_length_ph,
name="lstm_value",
)
self.memory_out = memory_value_out
else:
hidden_value = hidden_stream
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
self.old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
def _create_dc_critic_old(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
hidden_stream = ModelUtils.create_observation_streams(
self.policy.visual_in,
self.policy.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
if self.policy.use_recurrent:
hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
hidden_stream,
self.memory_in,
self.policy.sequence_length_ph,
name="lstm_value",
)
self.memory_out = memory_value_out
else:
hidden_value = hidden_stream
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
# Break old log probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)
_, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
old_log_prob_branches, self.policy.action_masks, self.policy.act_size
)
action_idx = [0] + list(np.cumsum(self.policy.act_size))
self.old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.policy.selected_actions[
:, action_idx[i] : action_idx[i + 1]
],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
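The update() method above alternates between joint, model-only, and policy-only updates when use_alter is set. A standalone illustration of that schedule, assuming integer division over alter_every is the intended behaviour:

def update_phase(num_updates: int, alter_every: int = 400) -> str:
    """Which update dict the optimizer would run at a given update count."""
    block = num_updates // alter_every
    if block == 0:
        return "all"                           # first block: joint model + policy update
    return "model" if block % 2 == 1 else "policy"

for step in (0, 399, 400, 799, 800):
    print(step, update_phase(step))            # 0/399 -> all, 400/799 -> model, 800 -> policy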

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (350 changes)


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (PPO)
# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
from collections import defaultdict
from typing import cast
import numpy as np
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.transfer_policy import TransferPolicy
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.ppo_transfer.optimizer import PPOTransferOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings
BUFFER_TRUNCATE_PERCENT = 0.6
logger = get_logger(__name__)
class PPOTransferTrainer(RLTrainer):
"""The PPOTrainer is an implementation of the PPO algorithm."""
def __init__(
self,
brain_name: str,
reward_buff_cap: int,
trainer_settings: TrainerSettings,
training: bool,
load: bool,
seed: int,
artifact_path: str,
):
"""
Responsible for collecting experiences and training PPO model.
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.
:param seed: The seed the model will be initialized with
:param artifact_path: The directory within which to store artifacts from this trainer.
"""
super(PPOTransferTrainer, self).__init__(
brain_name, trainer_settings, training, artifact_path, reward_buff_cap
)
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
)
self.load = load
self.seed = seed
self.policy: TransferPolicy = None # type: ignore
self.off_policy_buffer: AgentBuffer = AgentBuffer()
self.use_iealter = False
print("The current algorithm is PPO Transfer")
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""
Takes a trajectory and processes it, putting it into the update buffer.
Processing involves calculating value and advantage targets for model updating step.
:param trajectory: The Trajectory tuple containing the steps to be processed.
"""
super()._process_trajectory(trajectory)
agent_id = trajectory.agent_id # All the agents should have the same ID
agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
trajectory.done_reached and not trajectory.interrupted,
)
for name, v in value_estimates.items():
agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
agent_buffer_trajectory["{}_rewards".format(name)].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
# Compute GAE and returns
tmp_advantages = []
tmp_returns = []
for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
local_rewards = agent_buffer_trajectory[
"{}_rewards".format(name)
].get_batch()
local_value_estimates = agent_buffer_trajectory[
"{}_value_estimates".format(name)
].get_batch()
local_advantage = get_gae(
rewards=local_rewards,
value_estimates=local_value_estimates,
value_next=bootstrap_value,
gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.hyperparameters.lambd,
)
local_return = local_advantage + local_value_estimates
# This is later used as a target for the different value estimates
agent_buffer_trajectory["{}_returns".format(name)].set(local_return)
agent_buffer_trajectory["{}_advantage".format(name)].set(local_advantage)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)
# Get global advantages
global_advantages = list(
np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length
)
# Also append to the off-policy buffer when use_iealter is enabled
if self.use_iealter:
agent_buffer_trajectory.resequence_and_append(
self.off_policy_buffer, training_length=self.policy.sequence_length
)
# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
self._update_end_episode_stats(agent_id, self.optimizer)
def _is_ready_update(self):
"""
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
if self.use_iealter:
size_of_buffer = self.off_policy_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
else:
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy(self):
"""
Uses the update buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.
"""
if self.use_iealter:
self._update_model()
if self.update_buffer.num_experiences < self.hyperparameters.buffer_size:
return True
buffer_length = self.update_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
return True
def _update_model(self):
"""
Uses the off-policy buffer to update the model component of the network.
The reward signal generators must be updated in this method at their own pace.
"""
buffer_length = self.off_policy_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.off_policy_buffer["advantages"].get_batch()
self.off_policy_buffer["advantages"].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.off_policy_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
# self.off_policy_buffer.reset_agent()
if self.off_policy_buffer.num_experiences > self.hyperparameters.buffer_size:
self.off_policy_buffer.truncate(
int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
)
return True
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
"""
Creates a TransferPolicy for this trainer.
:param parsed_behavior_id: Behavior identifiers for the policy to be created.
:param brain_parameters: specifications for policy construction
:return policy
"""
policy = TransferPolicy(
self.seed,
brain_parameters,
self.trainer_settings,
self.is_training,
self.artifact_path,
self.load,
condition_sigma_on_obs=False, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
)
return policy
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
:param parsed_behavior_id: Behavior identifiers that the policy should belong to.
:param policy: Policy to associate with name_behavior_id.
"""
if self.policy:
logger.warning(
"Your environment contains multiple teams, but {} doesn't support adversarial games. Enable self-play to \
train adversarial games.".format(
self.__class__.__name__
)
)
if not isinstance(policy, TransferPolicy):
raise RuntimeError("Non-TransferPolicy passed to PPOTransferTrainer.add_policy()")
self.policy = policy
self.optimizer = PPOTransferOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly
self.step = policy.get_current_step()
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy
"""
return self.policy
def discount_rewards(r, gamma=0.99, value_next=0.0):
"""
Computes discounted sum of future rewards for use in updating value estimate.
:param r: List of rewards.
:param gamma: Discount factor.
:param value_next: T+1 value estimate for returns calculation.
:return: discounted sum of future rewards as list.
"""
discounted_r = np.zeros_like(r)
running_add = value_next
for t in reversed(range(0, r.size)):
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighting factor.
:return: list of advantage estimates for time-steps t to T.
"""
value_estimates = np.append(value_estimates, value_next)
delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage
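For reference, a small self-contained check of the GAE recurrence implemented by discount_rewards and get_gae above: TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) are discounted right-to-left by gamma * lambd, and returns are advantages plus value estimates. The numbers below are arbitrary and only illustrate the computation:

import numpy as np

rewards = np.array([1.0, 0.0, 0.5], dtype=np.float32)
values = np.array([0.9, 0.8, 0.7], dtype=np.float32)
value_next = 0.6          # bootstrap estimate for the step after the trajectory
gamma, lambd = 0.99, 0.95

values_ext = np.append(values, value_next)
deltas = rewards + gamma * values_ext[1:] - values_ext[:-1]   # TD residuals

advantages = np.zeros_like(deltas)
running = 0.0
for t in reversed(range(len(deltas))):
    running = running * gamma * lambd + deltas[t]
    advantages[t] = running

returns = advantages + values   # corresponds to the "{name}_returns" targets above
print(advantages)
print(returns)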

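The off-policy buffer in _update_model above is bounded in a similar spirit: once it holds more than buffer_size experiences, it is truncated to buffer_size * BUFFER_TRUNCATE_PERCENT entries. A toy sketch with a plain list standing in for AgentBuffer, assuming truncation keeps the most recent experiences:

BUFFER_TRUNCATE_PERCENT = 0.6
buffer_size = 10

experiences = list(range(14))          # pretend 14 experiences have accumulated
if len(experiences) > buffer_size:
    keep = int(buffer_size * BUFFER_TRUNCATE_PERCENT)
    experiences = experiences[-keep:]  # drop the oldest, keep the newest entries

print(len(experiences))                # -> 6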
14
Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial


%YAML 1.1
%TAG !u! tag:unity3d.com,2011:
--- !u!134 &13400000
PhysicMaterial:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_Name: Floor
dynamicFriction: 0.6
staticFriction: 0.6
bounciness: 0
frictionCombine: 0
bounceCombine: 0

8
Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial.meta


fileFormatVersion: 2
guid: dff6e5680d76643a481e8d81555ef3ee
NativeFormatImporter:
externalObjects: {}
mainObjectFileID: 13400000
userData:
assetBundleName:
assetBundleVariant: