
Develop model transfer test (#4214)

* test env, and code integration

* delete results
Branch: /develop/bisim-review
GitHub committed 4 years ago
Current commit: 839eb2cb
6 changed files, with 560 additions and 80 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (29 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (136 changes)
  3. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (4 changes)
  4. ml-agents/mlagents/trainers/settings.py (10 changes)
  5. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (223 changes)
  6. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (238 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (29 changes)


# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"

# slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True)
def load_graph_partial(self, path: str, transfer_type="dynamics"):
load_nets = {"dynamics": ["predict"],
def load_graph_partial(self, path: str, transfer_type="dynamics", load_model=True, load_policy=True,
load_value=True):
load_nets = {"dynamics": [],
if load_model:
load_nets["dynamics"].append("predict")
if load_policy:
load_nets["dynamics"].append("policy")
if load_value:
load_nets["dynamics"].append("value")
with self.graph.as_default():
for net in load_nets[transfer_type]:
variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, net)

self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
if var_predict:

hidden,
self.feature_size,
name="latent",
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
squared_difference = 0.5 * tf.reduce_sum(

self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.reward_loss = tf.reduce_mean(
tf.squared_difference(self.pred_reward, self.current_reward)
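Note on the partial loading above: in the TF1 graph API used here, restoring only part of a checkpoint is typically done by building a tf.train.Saver over the variables of the chosen scopes. A minimal sketch of that pattern, assuming the scope names from this diff ("predict", "policy", "value") and a checkpoint directory written by a previous run; the helper name restore_scopes is illustrative and not part of the PR:
import tensorflow as tf
def restore_scopes(sess, graph, checkpoint_dir, scopes):
    # Restore only the trainable variables that live under the given scopes,
    # leaving everything else at its freshly initialized value.
    with graph.as_default():
        for scope in scopes:
            variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            if not variables:
                continue
            saver = tf.train.Saver(var_list=variables)
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            saver.restore(sess, ckpt.model_checkpoint_path)
# e.g. restore_scopes(policy.sess, policy.graph, transfer_path, ["predict", "policy", "value"])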

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (136 changes)


self.in_batch_alter = hyperparameters.in_batch_alter
self.in_epoch_alter = hyperparameters.in_epoch_alter
self.op_buffer = hyperparameters.use_op_buffer
self.train_type = hyperparameters.train_type
self.train_encoder = hyperparameters.train_encoder
self.train_model = hyperparameters.train_model
self.train_policy = hyperparameters.train_policy
self.train_value = hyperparameters.train_value
# Transfer
self.use_transfer = hyperparameters.use_transfer
self.transfer_path = hyperparameters.transfer_path #"results/BallSingle_nosep_cmodel_small/3DBall"

with tf.variable_scope("value"):
if policy.use_continuous_act:
self._create_cc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
if hyperparameters.separate_value_net:
self._create_cc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
else:
self._create_cc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
self._create_dc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
if hyperparameters.separate_value_net:
self._create_dc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
else:
self._create_dc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
with tf.variable_scope("optimizer/"):
self.learning_rate = ModelUtils.create_schedule(

self.policy.initialize_or_load()
if self.use_transfer:
self.policy.load_graph_partial(self.transfer_path, self.transfer_type)
self.policy.load_graph_partial(self.transfer_path, self.transfer_type,
hyperparameters.load_model, hyperparameters.load_policy, hyperparameters.load_value)
self.policy.get_encoder_weights()
self.policy.get_policy_weights()

)
def _create_ppo_optimizer_ops(self):
if self.use_transfer:
if self.transfer_type == "dynamics":
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
print("trainable", train_vars)
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/mu")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/log_std")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
elif self.transfer_type == "observation":
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
else:
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
print("trainable", train_vars)
# if self.use_transfer:
# if self.transfer_type == "dynamics":
# if self.train_type == "all":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
# elif self.train_type == "encoding":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# elif self.train_type == "policy":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# print("trainable", train_vars)
# # train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/mu")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/log_std")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
# elif self.transfer_type == "observation":
# if self.train_type == "all":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
# elif self.train_type == "policy":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# # + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
# else:
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
train_vars = []
if self.train_encoder:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
if self.train_policy:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
if self.train_value:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
print("trainable", train_vars)
self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)
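A practical aside on the flag-driven selection above: printing train_vars dumps full variable reprs. When checking which sub-networks the train_encoder / train_model / train_policy / train_value flags actually select, it is usually enough to list names and shapes; a small illustrative helper (not part of the PR):
def print_trainable(scopes=("encoding", "predict", "policy", "value")):
    # Summarize the trainable variables per scope instead of printing whole tensors.
    for scope in scopes:
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope):
            print(scope, v.name, v.shape.as_list())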

def _init_alter_update(self):
if self.use_alter:
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
policy_train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.loss, var_list=policy_train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.loss, var_list=policy_train_vars)
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.loss, var_list=model_train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.loss, var_list=model_train_vars)
else:
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
train_vars = []
if self.train_encoder:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
if self.train_policy:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
if self.train_value:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)
self.model_optimizer = self.create_optimizer_op(self.model_learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.ppo_update_dict.update(
{

}
)
self.model_update_dict.update(
{
"model_loss": self.model_loss,

for stat_name, update_name in stats_needed.items():
if update_name in update_vals.keys():
update_stats[stat_name] = update_vals[update_name]
self.num_updates += 1
return update_stats

feed_dict[self.memory_in] = self._make_zero_mem(
self.m_size, mini_batch.num_experiences
)
# print(self.policy.sess.run(self.policy.encoder, feed_dict={self.policy.vector_in: mini_batch["vector_obs"]}))
return feed_dict
def _create_cc_critic_old(

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (4 changes)


buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
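The trainer-side change replaces optimizer.update with optimizer.update_part plus an explicit part name; only the "policy" part appears in this hunk. Assuming update_part also accepts "model" (suggested by the model_update_dict in the optimizer, but not shown here), a per-mini-batch alternation consistent with the in_batch_alter flag could look like this sketch:
for i in range(0, max_num_batch * batch_size, batch_size):
    mini_batch = buffer.make_mini_batch(i, i + batch_size)
    # Alternate which sub-network gets updated on each mini-batch.
    # "model" as a part name is an assumption; only "policy" is shown in the diff.
    part = "policy" if (i // batch_size) % 2 == 0 else "model"
    update_stats = self.optimizer.update_part(mini_batch, n_sequences, part)
    for stat_name, value in update_stats.items():
        batch_update_stats[stat_name].append(value)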

ml-agents/mlagents/trainers/settings.py (10 changes)


separate_value_train: bool = False
separate_policy_train: bool = False
seprate_value_net: bool = False
separate_value_net: bool = False
use_var_encoder: bool = False
use_var_predict: bool = False
with_prior: bool = False

in_batch_alter: bool = False
in_epoch_alter: bool = False
use_op_buffer: bool = False
train_type: str = "all"
train_encoder: bool = True
train_model: bool = True
train_policy: bool = True
train_value: bool = True
feature_size: int = 16
# Transfer

transfer_path: str = ""
transfer_type: str = "dynamics"
load_model: bool = True
load_value: bool = True
load_policy: bool = True
# Network
encoder_layers: int = 1
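For reference, the new settings fields map onto the transfer workflow exercised in the test below. A minimal sketch of constructing them in Python, assuming PPOTransferSettings exposes the fields above as keyword arguments (consistent with the attrs-based settings classes used in the tests; the path value is a placeholder, not a real run):
transfer_hp = PPOTransferSettings(
    use_transfer=True,
    transfer_path="./transfer_results/<source_run>/Simple",  # placeholder path
    transfer_type="dynamics",
    load_model=True, load_policy=True, load_value=True,      # which nets to restore
    train_encoder=True, train_model=False,                   # which nets to keep training
    train_policy=False, train_value=False,
)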

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (223 changes)


import math
import tempfile
import pytest
import numpy as np
import attr
from typing import Dict
from mlagents.trainers.tests.transfer_test_envs import SimpleTransferEnvironment
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary, TensorboardWriter, CSVWriter
from mlagents.trainers.settings import (
TrainerSettings,
PPOSettings,
PPOTransferSettings,
SACSettings,
NetworkSettings,
SelfPlaySettings,
BehavioralCloningSettings,
GAILSettings,
TrainerType,
RewardSignalType,
)
from mlagents.trainers.models import EncoderType, ScheduleType
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
BRAIN_NAME = "Simple"
PPO_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO,
hyperparameters=PPOSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=2, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
)
SAC_CONFIG = TrainerSettings(
trainer_type=TrainerType.SAC,
hyperparameters=SACSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=8,
buffer_init_steps=100,
buffer_size=5000,
tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,
)
Transfer_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO_Transfer,
hyperparameters=PPOTransferSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
feature_size=2,
reuse_encoder=True,
in_epoch_alter=True,
in_batch_alter=False,
use_op_buffer=True,
policy_layers=1
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
)
# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print("Last {} rewards:".format(last_n_rewards), rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
class DebugWriter(StatsWriter):
"""
Print to stdout so stats can be viewed in pytest
"""
def __init__(self):
self._last_reward_summary: Dict[str, float] = {}
self.stats = {}
def get_last_rewards(self):
return self._last_reward_summary
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int
) -> None:
for val, stats_summary in values.items():
if val == "Environment/Cumulative Reward":
print(step, val, stats_summary.mean)
self.stats[step] = stats_summary.mean
self._last_reward_summary[category] = stats_summary.mean
def write2file(self, filename):
with open(filename, "w") as reward_file:
for step in self.stats.keys():
reward_file.write(str(step) + ":" + str(self.stats[step]) + "\n")
def _check_environment_trains(
env,
trainer_config,
reward_processor=default_reward_processor,
meta_curriculum=None,
success_threshold=0.9,
env_manager=None,
run_id="id",
seed=1337
):
# Create controller and begin training.
model_dir = "./transfer_results/" + run_id
StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
csv_writer = CSVWriter(
model_dir,
required_fields=[
"Environment/Cumulative Reward",
"Environment/Episode Length",
],
)
tb_writer = TensorboardWriter(
model_dir, clear_past_data=True
)
StatsReporter.add_writer(tb_writer)
StatsReporter.add_writer(csv_writer)
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
output_path=model_dir,
train_model=True,
load_model=False,
seed=seed,
meta_curriculum=meta_curriculum,
multi_gpu=False,
)
tc = TrainerController(
trainer_factory=trainer_factory,
output_path=model_dir,
run_id=run_id,
meta_curriculum=meta_curriculum,
train=True,
training_seed=seed,
)
# Begin training
tc.start_learning(env_manager)
# debug_writer.write2file(model_dir+"/reward.txt")
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)
def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich", run_id="modelbased_rich_5e-4", seed=1337):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
)
new_hyperparams = attr.evolve(
config.hyperparameters, batch_size=64, buffer_size=640, learning_rate=5.0e-4,
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)
def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich", run_id="transfer_rich_iealter_retrain-enc_5e-4", seed=1337):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
)
new_hyperparams = attr.evolve(
config.hyperparameters, batch_size=64, buffer_size=640, use_transfer=True,
transfer_path="./transfer_results/modelbased_normal_opbuf_ibalter_s2/Simple",
use_op_buffer=True, in_epoch_alter=True, learning_rate=5.0e-4, train_policy=False,
train_value=False, train_model=False
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)
if __name__ == "__main__":
# test_2d_model(seed=0)
test_2d_transfer(seed=0)
# for i in range(5):
# test_2d_model(seed=i)
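As the __main__ block suggests, the test is meant to be run in two stages: test_2d_model first trains a source run (checkpoints presumably end up under ./transfer_results/<run_id>_s<seed>/Simple, given how model_dir is built above), and test_2d_transfer then restores from that directory via its hard-coded transfer_path. A sketch of that sequence with illustrative run ids:
if __name__ == "__main__":
    # Stage 1: train the source task and write checkpoints under ./transfer_results/...
    test_2d_model(obs_spec_type="normal", run_id="modelbased_normal", seed=0)
    # Stage 2: fine-tune on the "rich" observation variant, restoring the source nets.
    # Note: transfer_path inside test_2d_transfer is hard-coded and must point at the
    # directory produced in stage 1.
    test_2d_transfer(obs_spec_type="rich", run_id="transfer_rich", seed=0)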

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (238 changes)


import random
from typing import Dict, List, Any, Tuple
import numpy as np
from mlagents_envs.base_env import (
BaseEnv,
BehaviorSpec,
DecisionSteps,
TerminalSteps,
ActionType,
BehaviorMapping,
)
from mlagents_envs.tests.test_rpc_utils import proto_from_steps_and_action
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)
OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY
def clamp(x, min_val, max_val):
return max(min_val, min(x, max_val))
class SimpleTransferEnvironment(BaseEnv):
"""
Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1, and a reward of -1 if
it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).
"""
def __init__(
self,
brain_names,
use_discrete,
step_size=STEP_SIZE,
num_visual=0,
num_vector=1,
vis_obs_size=VIS_OBS_SIZE,
vec_obs_size=OBS_SIZE,
action_size=1,
obs_spec_type="normal" # normal: (x,y); rich: (x+y, x-y, x*y)
):
super().__init__()
self.discrete = use_discrete
self.num_visual = num_visual
self.num_vector = num_vector
self.vis_obs_size = vis_obs_size
self.vec_obs_size = vec_obs_size
self.obs_spec_type = obs_spec_type
action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
self.behavior_spec = BehaviorSpec(
self._make_obs_spec(),
action_type,
tuple(2 for _ in range(action_size)) if use_discrete else action_size,
)
self.action_size = action_size
self.names = brain_names
self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}
self.random = random.Random(str(self.behavior_spec))
self.goal: Dict[str, int] = {}
self.action = {}
self.rewards: Dict[str, float] = {}
self.final_rewards: Dict[str, List[float]] = {}
self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self.agent_id: Dict[str, int] = {}
self.step_size = step_size # defines the difficulty of the test
for name in self.names:
self.agent_id[name] = 0
self.goal[name] = self.random.choice([-1, 1])
self.rewards[name] = 0
self.final_rewards[name] = []
self._reset_agent(name)
self.action[name] = None
self.step_result[name] = None
def _make_obs_spec(self) -> List[Any]:
obs_spec: List[Any] = []
# goal
for _ in range(self.num_vector):
obs_spec.append((self.vec_obs_size,))
for _ in range(self.num_visual):
obs_spec.append(self.vis_obs_size)
# position
if self.obs_spec_type == "normal":
for _ in range(self.num_vector):
obs_spec.append((self.vec_obs_size,))
# composed position
if self.obs_spec_type == "rich":
for _ in range(self.num_vector+1):
obs_spec.append((self.vec_obs_size,))
print("obs_spec:", obs_spec)
return obs_spec
def _make_obs(self, value: float) -> List[np.ndarray]:
obs = []
for _ in range(self.num_vector):
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * value)
if self.obs_spec_type == "normal":
for name in self.names:
for i in self.positions[name]:
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * i)
elif self.obs_spec_type == "rich":
for name in self.names:
i = self.positions[name][0]
j = self.positions[name][1]
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i+j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i-j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i*j))
for _ in range(self.num_visual):
obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
return obs
@property
def behavior_specs(self):
behavior_dict = {}
for n in self.names:
behavior_dict[n] = self.behavior_spec
return BehaviorMapping(behavior_dict)
def set_action_for_agent(self, behavior_name, agent_id, action):
pass
def set_actions(self, behavior_name, action):
self.action[behavior_name] = action
def get_steps(self, behavior_name):
return self.step_result[behavior_name]
def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta
self.positions[name][i] = clamp(self.positions[name][i], -1, 1)
self.step_count[name] += 1
# Both must be in 1.0 to be done
done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name])
# print(self.positions)
return done
def _generate_mask(self):
if self.discrete:
# LL-Python API will return an empty dim if there is only 1 agent.
ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
ndmask = np.expand_dims(ndmask, axis=0)
action_mask = [ndmask]
else:
action_mask = None
return action_mask
def _compute_reward(self, name: str, done: bool) -> float:
if done:
reward = 0.0
for _pos in self.positions[name]:
reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
self.positions[name]
)
else:
reward = -TIME_PENALTY
return reward
def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.positions[name] = [0.0 for _ in range(self.action_size)]
self.step_count[name] = 0
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1
def _make_batched_step(
self, name: str, done: bool, reward: float
) -> Tuple[DecisionSteps, TerminalSteps]:
m_vector_obs = self._make_obs(self.goal[name])
m_reward = np.array([reward], dtype=np.float32)
m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
action_mask = self._generate_mask()
decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
terminal_step = TerminalSteps.empty(self.behavior_spec)
if done:
self.final_rewards[name].append(self.rewards[name])
self._reset_agent(name)
new_vector_obs = self._make_obs(self.goal[name])
(
new_reward,
new_done,
new_agent_id,
new_action_mask,
) = self._construct_reset_step(name)
decision_step = DecisionSteps(
new_vector_obs, new_reward, new_agent_id, new_action_mask
)
terminal_step = TerminalSteps(
m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
)
return (decision_step, terminal_step)
def _construct_reset_step(
self, name: str
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
new_reward = np.array([0.0], dtype=np.float32)
new_done = np.array([False], dtype=np.bool)
new_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
new_action_mask = self._generate_mask()
return new_reward, new_done, new_agent_id, new_action_mask
def step(self) -> None:
assert all(action is not None for action in self.action.values())
for name in self.names:
done = self._take_action(name)
reward = self._compute_reward(name, done)
self.rewards[name] += reward
self.step_result[name] = self._make_batched_step(name, done, reward)
def reset(self) -> None: # type: ignore
for name in self.names:
self._reset_agent(name)
self.step_result[name] = self._make_batched_step(name, False, 0.0)
@property
def reset_parameters(self) -> Dict[str, str]:
return {}
def close(self):
pass
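The only difference between the "normal" and "rich" observation specs is how the 2-D position is encoded: "normal" exposes (x, y) directly, while "rich" exposes the derived features (x+y, x-y, x*y), so a transferred encoder has to map a different observation space onto the same underlying state. A tiny standalone illustration of that transform (plain numpy, independent of the environment class):
import numpy as np
def rich_features(x: float, y: float) -> np.ndarray:
    # "Rich" encoding of a 2-D position, as built in _make_obs above.
    return np.array([x + y, x - y, x * y], dtype=np.float32)
print(rich_features(0.3, -0.5))  # approximately [-0.2, 0.8, -0.15]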