
Merge branch 'self-play-mutex' into soccer-2v1

/asymm-envs
Andrew Cohen, 4 years ago
Current commit
a870d453
Showing 15 changed files with 295 additions and 81 deletions
  1. .yamato/training-int-tests.yml (5 changed lines)
  2. com.unity.ml-agents/CHANGELOG.md (3 changed lines)
  3. gym-unity/gym_unity/envs/__init__.py (25 changed lines)
  4. gym-unity/gym_unity/tests/test_gym.py (48 changed lines)
  5. ml-agents/mlagents/trainers/behavior_id_utils.py (2 changed lines)
  6. ml-agents/mlagents/trainers/distributions.py (2 changed lines)
  7. ml-agents/mlagents/trainers/sac/optimizer.py (1 changed line)
  8. ml-agents/mlagents/trainers/subprocess_env_manager.py (104 changed lines)
  9. ml-agents/mlagents/trainers/tests/simple_test_envs.py (2 changed lines)
  10. ml-agents/mlagents/trainers/tests/test_distributions.py (10 changed lines)
  11. ml-agents/mlagents/trainers/tests/test_simple_rl.py (26 changed lines)
  12. ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (46 changed lines)
  13. ml-agents/mlagents/trainers/trainer_controller.py (14 changed lines)
  14. ml-agents/tests/yamato/training_int_tests.py (37 changed lines)
  15. ml-agents/tests/yamato/yamato_utils.py (51 changed lines)

.yamato/training-int-tests.yml (5 changed lines)


commands:
- pip install pyyaml
- python -u -m ml-agents.tests.yamato.training_int_tests
# Backwards-compatibility tests.
# If we make a breaking change to the communication protocol, these will need
# to be disabled until the next release.
- python -u -m ml-agents.tests.yamato.training_int_tests --python=0.15.0
- python -u -m ml-agents.tests.yamato.training_int_tests --csharp=0.15.0
triggers:
cancel_old_ci: true
changes:

com.unity.ml-agents/CHANGELOG.md (3 changed lines)


- Renamed 'Generalization' feature to 'Environment Parameter Randomization'.
- Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
- The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used. (A sketch of this rule follows this list.)
- Fixed the reported entropy values for continuous actions (#3684)
- Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
- Fixed an issue where exceptions from environments provided a returncode of 0. (#3680)
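A minimal sketch of the port-selection rule described in the UnityEnvironment entry above (the helper name and argument layout are illustrative, not the actual UnityEnvironment code):

```python
DEFAULT_EDITOR_PORT = 5004       # used when connecting to the Unity Editor
BASE_ENVIRONMENT_PORT = 5005     # used when launching a built executable

def resolve_port(file_name=None, base_port=None):
    """Illustrative only: mirrors the changelog wording."""
    if base_port is not None:    # an explicitly requested port always wins
        return base_port
    return DEFAULT_EDITOR_PORT if file_name is None else BASE_ENVIRONMENT_PORT
```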
## [0.15.0-preview] - 2020-03-18
### Major Changes

gym-unity/gym_unity/envs/__init__.py (25 changed lines)


def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
n_extra_agents = step_result.n_agents() - self._n_agents
if n_extra_agents < 0 or n_extra_agents > self._n_agents:
if n_extra_agents < 0:
# or too many requested a decision
raise UnityGymException(
"The number of agents in the scene does not match the expected number."
)

# only cares about the ordering.
for index, agent_id in enumerate(step_result.agent_id):
if not self._previous_step_result.contains_agent(agent_id):
if step_result.done[index]:
# If the Agent is already done (e.g. it ended its episode twice in one step)
# Don't try to register it here.
continue
# Register this agent, and get the reward of the previous agent that
# was in its index, so that we can return it to the gym.
last_reward = self.agent_mapper.register_new_agent_id(agent_id)

"""
Declare the agent done with the corresponding final reward.
"""
gym_index = self._agent_id_to_gym_index.pop(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
if agent_id in self._agent_id_to_gym_index:
gym_index = self._agent_id_to_gym_index.pop(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
else:
# Agent was never registered in the first place (e.g. EndEpisode called multiple times)
pass
def register_new_agent_id(self, agent_id: int) -> float:
"""

self._gym_id_order = list(agent_ids)
def mark_agent_done(self, agent_id: int, reward: float) -> None:
gym_index = self._gym_id_order.index(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
self._gym_id_order[gym_index] = -1
try:
gym_index = self._gym_id_order.index(agent_id)
self._done_agents_index_to_last_reward[gym_index] = reward
self._gym_id_order[gym_index] = -1
except ValueError:
# Agent was never registered in the first place (e.g. EndEpisode called multiple times)
pass
def register_new_agent_id(self, agent_id: int) -> float:
original_index = self._gym_id_order.index(-1)

gym-unity/gym_unity/tests/test_gym.py (48 changed lines)


assert expected_agent_id == agent_id
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_new_agent_done(mock_env):
mock_spec = create_mock_group_spec(
vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
)
mock_step = create_mock_vector_step_result(num_agents=3)
mock_step.agent_id = np.array(range(5))
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=True)
received_step_result = create_mock_vector_step_result(num_agents=7)
received_step_result.agent_id = np.array(range(7))
# agent #3 (id = 2) is Done
# so is the "new" agent (id = 5)
done = [False] * 7
done[2] = True
done[5] = True
received_step_result.done = np.array(done)
sanitized_result = env._sanitize_info(received_step_result)
for expected_agent_id, agent_id in zip([0, 1, 6, 3, 4], sanitized_result.agent_id):
assert expected_agent_id == agent_id
@mock.patch("gym_unity.envs.UnityEnvironment")
def test_sanitize_action_single_agent_multiple_done(mock_env):
mock_spec = create_mock_group_spec(
vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
)
mock_step = create_mock_vector_step_result(num_agents=1)
mock_step.agent_id = np.array(range(1))
setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
env = UnityEnv(" ", use_visual=False, multiagent=False)
received_step_result = create_mock_vector_step_result(num_agents=3)
received_step_result.agent_id = np.array(range(3))
# original agent (id = 0) is Done
# so is the "new" agent (id = 1)
done = [True, True, False]
received_step_result.done = np.array(done)
sanitized_result = env._sanitize_info(received_step_result)
for expected_agent_id, agent_id in zip([2], sanitized_result.agent_id):
assert expected_agent_id == agent_id
# Helper methods

# Mark some agents as done with their last rewards.
mapper.mark_agent_done(1001, 42.0)
mapper.mark_agent_done(1004, 1337.0)
# Make sure we can handle an unknown agent id being marked done.
# This can happen when an agent ends an episode on the same step it starts.
mapper.mark_agent_done(9999, -1.0)
# Now add new agents, and get the rewards of the agent they replaced.
old_reward1 = mapper.register_new_agent_id(2001)

ml-agents/mlagents/trainers/behavior_id_utils.py (2 changed lines)


"""
Parses a name_behavior_id of the form name?team=0
into a BehaviorIdentifiers NamedTuple.
This allows you to access the brain name and team id og an agent
This allows you to access the brain name and team id of an agent
:param name_behavior_id: String of behavior params in HTTP format.
:returns: A BehaviorIdentifiers object.
"""

ml-agents/mlagents/trainers/distributions.py (2 changed lines)


self, encoded: "GaussianDistribution.MuSigmaTensors"
) -> tf.Tensor:
single_dim_entropy = 0.5 * tf.reduce_mean(
tf.log(2 * np.pi * np.e) + tf.square(encoded.log_sigma)
tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
)
# Make entropy the right shape
return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy
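For reference, the per-dimension differential entropy of a Gaussian with standard deviation σ = exp(log_sigma) is

$$ H = \tfrac{1}{2}\log\!\left(2\pi e\,\sigma^{2}\right) = \tfrac{1}{2}\left(\log(2\pi e) + 2\log\sigma\right), $$

which is linear in log σ. The removed `tf.square(encoded.log_sigma)` term computed (log σ)² rather than 2·log σ; the one-line change above restores the closed-form expression (this is the entropy fix listed as #3684 in the CHANGELOG).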

ml-agents/mlagents/trainers/sac/optimizer.py (1 changed line)


"q1_loss": self.q1_loss,
"q2_loss": self.q2_loss,
"entropy_coef": self.ent_coef,
"entropy": self.policy.entropy,
"update_batch": self.update_batch_policy,
"update_value": self.update_batch_value,
"update_entropy": self.update_batch_entropy,

ml-agents/mlagents/trainers/subprocess_env_manager.py (104 changed lines)


import logging
from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set, Tuple
import cloudpickle
import enum
from mlagents_envs.exception import UnityCommunicationException, UnityTimeOutException
from mlagents_envs.exception import (
UnityCommunicationException,
UnityTimeOutException,
UnityEnvironmentException,
)
from multiprocessing import Process, Pipe, Queue
from multiprocessing.connection import Connection
from queue import Empty as EmptyQueueException

logger = logging.getLogger("mlagents.trainers")
class EnvironmentCommand(NamedTuple):
name: str
class EnvironmentCommand(enum.Enum):
STEP = 1
EXTERNAL_BRAINS = 2
GET_PROPERTIES = 3
RESET = 4
CLOSE = 5
ENV_EXITED = 6
class EnvironmentRequest(NamedTuple):
cmd: EnvironmentCommand
name: str
cmd: EnvironmentCommand
worker_id: int
payload: Any
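For context, this hunk replaces free-form string command names with an `enum.Enum` and typed request tuples. A standalone sketch of the idea (field layout simplified, not the exact module):

```python
import enum
from typing import Any, NamedTuple

class EnvironmentCommand(enum.Enum):
    STEP = 1
    RESET = 4
    CLOSE = 5

class EnvironmentRequest(NamedTuple):
    cmd: EnvironmentCommand
    payload: Any = None

req = EnvironmentRequest(EnvironmentCommand.STEP, payload={"brain_name": "actions"})
# Enum members are compared by identity, so a mistyped command fails loudly
# instead of silently falling through a string-based dispatch.
assert req.cmd is EnvironmentCommand.STEP
```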

self.previous_all_action_info: Dict[str, ActionInfo] = {}
self.waiting = False
def send(self, name: str, payload: Any = None) -> None:
def send(self, cmd: EnvironmentCommand, payload: Any = None) -> None:
cmd = EnvironmentCommand(name, payload)
self.conn.send(cmd)
req = EnvironmentRequest(cmd, payload)
self.conn.send(req)
except (BrokenPipeError, EOFError):
raise UnityCommunicationException("UnityEnvironment worker: send failed.")

if response.cmd == EnvironmentCommand.ENV_EXITED:
env_exception: Exception = response.payload
raise env_exception
return response
except (BrokenPipeError, EOFError):
raise UnityCommunicationException("UnityEnvironment worker: recv failed.")

self.conn.send(EnvironmentCommand("close"))
self.conn.send(EnvironmentRequest(EnvironmentCommand.CLOSE))
except (BrokenPipeError, EOFError):
logger.debug(
f"UnityEnvWorker {self.worker_id} got exception trying to close."

engine_configuration_channel = EngineConfigurationChannel()
engine_configuration_channel.set_configuration(engine_configuration)
stats_channel = StatsSideChannel()
env: BaseEnv = env_factory(
worker_id,
[shared_float_properties, engine_configuration_channel, stats_channel],
)
env: BaseEnv = None
def _send_response(cmd_name, payload):
def _send_response(cmd_name: EnvironmentCommand, payload: Any) -> None:
parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload))
def _generate_all_results() -> AllStepResult:

return result
try:
env = env_factory(
worker_id,
[shared_float_properties, engine_configuration_channel, stats_channel],
)
cmd: EnvironmentCommand = parent_conn.recv()
if cmd.name == "step":
all_action_info = cmd.payload
req: EnvironmentRequest = parent_conn.recv()
if req.cmd == EnvironmentCommand.STEP:
all_action_info = req.payload
for brain_name, action_info in all_action_info.items():
if len(action_info.action) != 0:
env.set_actions(brain_name, action_info.action)

step_response = StepResponse(
all_step_result, get_timer_root(), env_stats
)
step_queue.put(EnvironmentResponse("step", worker_id, step_response))
step_queue.put(
EnvironmentResponse(
EnvironmentCommand.STEP, worker_id, step_response
)
)
elif cmd.name == "external_brains":
_send_response("external_brains", external_brains())
elif cmd.name == "get_properties":
elif req.cmd == EnvironmentCommand.EXTERNAL_BRAINS:
_send_response(EnvironmentCommand.EXTERNAL_BRAINS, external_brains())
elif req.cmd == EnvironmentCommand.GET_PROPERTIES:
_send_response("get_properties", reset_params)
elif cmd.name == "reset":
for k, v in cmd.payload.items():
_send_response(EnvironmentCommand.GET_PROPERTIES, reset_params)
elif req.cmd == EnvironmentCommand.RESET:
for k, v in req.payload.items():
_send_response("reset", all_step_result)
elif cmd.name == "close":
_send_response(EnvironmentCommand.RESET, all_step_result)
elif req.cmd == EnvironmentCommand.CLOSE:
except (KeyboardInterrupt, UnityCommunicationException, UnityTimeOutException):
except (
KeyboardInterrupt,
UnityCommunicationException,
UnityTimeOutException,
UnityEnvironmentException,
) as ex:
step_queue.put(EnvironmentResponse("env_close", worker_id, None))
step_queue.put(
EnvironmentResponse(EnvironmentCommand.ENV_EXITED, worker_id, ex)
)
_send_response(EnvironmentCommand.ENV_EXITED, ex)
finally:
# If this worker has put an item in the step queue that hasn't been processed by the EnvManager, the process
# will hang until the item is processed. We avoid this behavior by using Queue.cancel_join_thread()

step_queue.cancel_join_thread()
step_queue.close()
env.close()
if env is not None:
env.close()
logger.debug(f"UnityEnvironment worker {worker_id} done.")
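A self-contained sketch of the queue shutdown pattern described in the comment above (simplified; the real worker loop lives in subprocess_env_manager.py):

```python
from multiprocessing import Process, Queue

def worker(step_queue: Queue) -> None:
    try:
        step_queue.put("step result")  # the parent may never drain this item
    finally:
        # Without cancel_join_thread(), interpreter shutdown can block while the
        # queue's feeder thread waits for buffered items to be flushed.
        step_queue.cancel_join_thread()
        step_queue.close()

if __name__ == "__main__":
    q = Queue()
    p = Process(target=worker, args=(q,))
    p.start()
    p.join()  # the worker exits cleanly even though the parent never reads from q
```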

if not env_worker.waiting:
env_action_info = self._take_step(env_worker.previous_step)
env_worker.previous_all_action_info = env_action_info
env_worker.send("step", env_action_info)
env_worker.send(EnvironmentCommand.STEP, env_action_info)
env_worker.waiting = True
def _step(self) -> List[EnvironmentStep]:

while len(worker_steps) < 1:
try:
while True:
step = self.step_queue.get_nowait()
if step.name == "env_close":
raise UnityCommunicationException(
"At least one of the environments has closed."
)
step: EnvironmentResponse = self.step_queue.get_nowait()
if step.cmd == EnvironmentCommand.ENV_EXITED:
env_exception: Exception = step.payload
raise env_exception
self.env_workers[step.worker_id].waiting = False
if step.worker_id not in step_workers:
worker_steps.append(step)

self.env_workers[step.worker_id].waiting = False
# First enqueue reset commands for all workers so that they reset in parallel
for ew in self.env_workers:
ew.send("reset", config)
ew.send(EnvironmentCommand.RESET, config)
# Next (synchronously) collect the reset observations from each worker in sequence
for ew in self.env_workers:
ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {}, {})

def external_brains(self) -> Dict[AgentGroup, BrainParameters]:
self.env_workers[0].send("external_brains")
self.env_workers[0].send(EnvironmentCommand.EXTERNAL_BRAINS)
self.env_workers[0].send("get_properties")
self.env_workers[0].send(EnvironmentCommand.GET_PROPERTIES)
return self.env_workers[0].recv().payload
def close(self) -> None:

ml-agents/mlagents/trainers/tests/simple_test_envs.py (2 changed lines)


VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
TIME_PENALTY = 0.001
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY

ml-agents/mlagents/trainers/tests/test_distributions.py (10 changed lines)


def test_gaussian_distribution():
with tf.Graph().as_default():
logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32)
distribution = GaussianDistribution(
logits,
act_size=VECTOR_ACTION_SPACE,

assert out.shape[1] == VECTOR_ACTION_SPACE[0]
output = sess.run([distribution.total_log_probs])
assert output[0].shape[0] == 1
# Test entropy is correct
log_std_tensor = tf.get_default_graph().get_tensor_by_name(
"log_std/BiasAdd:0"
)
feed_dict = {log_std_tensor: [[1.0, 1.0]]}
entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
# Entropy with log_std of 1.0 should be 2.42
assert pytest.approx(entropy[0], 0.01) == 2.42
def test_tanh_distribution():
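The expected value in the entropy assertion above can be checked by hand with plain NumPy, independent of the TensorFlow graph:

```python
import numpy as np

log_sigma = 1.0
entropy = 0.5 * (np.log(2 * np.pi * np.e) + 2 * log_sigma)
print(round(entropy, 2))  # 2.42, the value asserted in the test
```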

ml-agents/mlagents/trainers/tests/test_simple_rl.py (26 changed lines)


lambd: 0.95
learning_rate: 5.0e-3
learning_rate_schedule: constant
max_steps: 2000
max_steps: 3000
memory_size: 16
normalize: false
num_epoch: 3

# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print("Last {} rewards:".format(last_n_rewards), rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()

trainer_config,
reward_processor=default_reward_processor,
meta_curriculum=None,
success_threshold=0.99,
success_threshold=0.9,
env_manager=None,
):
# Create controller and begin training.

if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]

def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
override_vals = {
"max_steps": 4000,
"max_steps": 5000,
"batch_size": 64,
"buffer_size": 128,
"learning_rate": 1e-3,

@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.5
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
override_vals = {"buffer_init_steps": 2000, "max_steps": 3000}
override_vals = {"buffer_init_steps": 2000, "max_steps": 4000}
_check_environment_trains(env, config)
_check_environment_trains(env, config, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True, False])

@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
override_vals = {"batch_size": 32, "use_recurrent": True, "max_steps": 2000}
override_vals = {
"batch_size": 64,
"use_recurrent": True,
"max_steps": 3000,
"learning_rate": 1e-3,
"buffer_init_steps": 500,
}
config = generate_config(SAC_CONFIG, override_vals)
_check_environment_trains(env, config)

processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.99
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)

ml-agents/mlagents/trainers/tests/test_subprocess_env_manager.py (46 changed lines)


SubprocessEnvManager,
EnvironmentResponse,
StepResponse,
EnvironmentCommand,
from mlagents_envs.exception import UnityEnvironmentException
from mlagents.trainers.tests.simple_test_envs import SimpleEnvironment
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.tests.test_simple_rl import (

def create_worker_mock(worker_id, step_queue, env_factor, engine_c):
return MockEnvWorker(worker_id, EnvironmentResponse("reset", worker_id, worker_id))
return MockEnvWorker(
worker_id, EnvironmentResponse(EnvironmentCommand.RESET, worker_id, worker_id)
)
class SubprocessEnvManagerTest(unittest.TestCase):

)
params = {"test": "params"}
manager._reset_env(params)
manager.env_workers[0].send.assert_called_with("reset", (params))
manager.env_workers[0].send.assert_called_with(
EnvironmentCommand.RESET, (params)
)
@mock.patch(
"mlagents.trainers.subprocess_env_manager.SubprocessEnvManager.create_worker"

params = {"test": "params"}
res = manager._reset_env(params)
for i, env in enumerate(manager.env_workers):
env.send.assert_called_with("reset", (params))
env.send.assert_called_with(EnvironmentCommand.RESET, (params))
env.recv.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(

)
manager.step_queue = Mock()
manager.step_queue.get_nowait.side_effect = [
EnvironmentResponse("step", 0, StepResponse(0, None, {})),
EnvironmentResponse("step", 1, StepResponse(1, None, {})),
EnvironmentResponse(EnvironmentCommand.STEP, 0, StepResponse(0, None, {})),
EnvironmentResponse(EnvironmentCommand.STEP, 1, StepResponse(1, None, {})),
EmptyQueue(),
]
step_mock = Mock()

res = manager._step()
for i, env in enumerate(manager.env_workers):
if i < 2:
env.send.assert_called_with("step", step_mock)
env.send.assert_called_with(EnvironmentCommand.STEP, step_mock)
manager.step_queue.get_nowait.assert_called()
# Check that the "last steps" are set to the value returned for each step
self.assertEqual(

env_manager.advance()
assert env_manager.policies[brain_name] == mock_policy
assert agent_manager_mock.policy == mock_policy
def simple_env_factory(worker_id, config):
env = SimpleEnvironment(["1D"], use_discrete=True)
return env
def simple_env_factory(worker_id, config):
env = SimpleEnvironment(["1D"], use_discrete=True)
return env
env_manager = SubprocessEnvManager(
simple_env_factory, EngineConfig.default_config(), num_envs
)

val > 0.7 for val in StatsReporter.writers[0].get_last_rewards().values()
)
env_manager.close()
@pytest.mark.parametrize("num_envs", [1, 4])
def test_subprocess_env_raises_errors(num_envs):
def failing_env_factory(worker_id, config):
import time
# Sleep momentarily to allow time for the EnvManager to be waiting for the
# subprocess response. We won't be able to capture failures from the subprocess
# that cause it to close the pipe before we can send the first message.
time.sleep(0.1)
raise UnityEnvironmentException()
env_manager = SubprocessEnvManager(
failing_env_factory, EngineConfig.default_config(), num_envs
)
with pytest.raises(UnityEnvironmentException):
env_manager.reset()
env_manager.close()

ml-agents/mlagents/trainers/trainer_controller.py (14 changed lines)


# Final save Tensorflow model
if global_step != 0 and self.train_model:
self._save_model()
except (KeyboardInterrupt, UnityCommunicationException):
except (
KeyboardInterrupt,
UnityCommunicationException,
UnityEnvironmentException,
) as ex:
pass
if isinstance(ex, KeyboardInterrupt):
pass
else:
# If the environment failed, we want to make sure to raise
# the exception so we exit the process with a return code of 1.
raise ex
if self.train_model:
self._export_graph()

ml-agents/tests/yamato/training_int_tests.py (37 changed lines)


import argparse
import time
from .yamato_utils import (
get_base_path,

checkout_csharp_version,
undo_git_checkout,
def main():
nn_file_expected = "./models/ppo/3DBall.nn"
def run_training(python_version, csharp_version):
latest = "latest"
run_id = int(time.time() * 1000.0)
print(
f"Running training with python={python_version or latest} and c#={csharp_version or latest}"
)
nn_file_expected = f"./models/{run_id}/3DBall.nn"
if os.path.exists(nn_file_expected):
# Should never happen - make sure nothing leftover from an old test.
print("Artifacts from previous build found!")

print(f"Running in base path {base_path}")
if csharp_version is not None:
checkout_csharp_version(csharp_version)
init_venv()
venv_path = init_venv(python_version)
# Copy the default training config but override the max_steps parameter,
# and reduce the batch_size and buffer_size enough to ensure an update step happens.

# TODO pass scene name and exe destination to build
# TODO make sure we fail if the exe isn't found - see MLA-559
mla_learn_cmd = "mlagents-learn override.yaml --train --env=Project/testPlayer --no-graphics --env-args -logFile -" # noqa
res = subprocess.run(f"source venv/bin/activate; {mla_learn_cmd}", shell=True)
mla_learn_cmd = f"mlagents-learn override.yaml --train --env=Project/testPlayer --run-id={run_id} --no-graphics --env-args -logFile -" # noqa
res = subprocess.run(
f"source {venv_path}/bin/activate; {mla_learn_cmd}", shell=True
)
if res.returncode != 0 or not os.path.exists(nn_file_expected):
print("mlagents-learn run FAILED!")

sys.exit(0)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--python", default=None)
parser.add_argument("--csharp", default=None)
args = parser.parse_args()
try:
run_training(args.python, args.csharp)
finally:
# Cleanup - this gets executed even if we hit sys.exit()
undo_git_checkout()
if __name__ == "__main__":

ml-agents/tests/yamato/yamato_utils.py (51 changed lines)


return res.returncode
def init_venv():
def init_venv(mlagents_python_version: str = None) -> str:
"""
Set up the virtual environment, and return the venv path.
:param mlagents_python_version: The version of the mlagents Python package to install.
If None, will do a local install, otherwise will install from pypi
:return:
"""
# Use a different venv path for different versions
venv_path = "venv"
if mlagents_python_version:
venv_path += "_" + mlagents_python_version
subprocess.check_call("python -m venv venv", shell=True)
subprocess.check_call(f"python -m venv {venv_path}", shell=True)
"-e ./ml-agents-envs",
"-e ./ml-agents",
if mlagents_python_version:
# install from pypi
pip_commands.append(f"mlagents=={mlagents_python_version}")
else:
# Local install
pip_commands += ["-e ./ml-agents-envs", "-e ./ml-agents"]
f"source venv/bin/activate; python -m pip install -q {cmd}", shell=True
f"source {venv_path}/bin/activate; python -m pip install -q {cmd}",
shell=True,
)
return venv_path
def checkout_csharp_version(csharp_version):
"""
Checks out the specific git revision (usually a tag) for the C# package and Project.
If csharp_version is None, no changes are made.
:param csharp_version:
:return:
"""
if csharp_version is None:
return
csharp_dirs = ["com.unity.ml-agents", "Project"]
for csharp_dir in csharp_dirs:
subprocess.check_call(
f"git checkout {csharp_version} -- {csharp_dir}", shell=True
def undo_git_checkout():
"""
Clean up the git working directory.
"""
subprocess.check_call("git reset HEAD .", shell=True)
subprocess.check_call("git checkout -- .", shell=True)
def override_config_file(src_path, dest_path, **kwargs):
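A hypothetical end-to-end use of the helpers above, along the lines of the backwards-compatibility runs in training_int_tests.py (the version string and control flow are illustrative):

```python
venv_path = init_venv("0.15.0")      # creates "venv_0.15.0" and installs mlagents==0.15.0 from PyPI
checkout_csharp_version("0.15.0")    # pins com.unity.ml-agents and Project to that revision
try:
    pass  # build the test player, run mlagents-learn, verify the exported .nn file
finally:
    undo_git_checkout()              # restore the git working tree regardless of outcome
```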
