
Basic and visual GAIL and BC integration tests (#3626)

/bug-failed-api-check
GitHub · 5 years ago
Current commit: 2912c883
6 files changed, with 257 insertions and 10 deletions
1. ml-agents-envs/mlagents_envs/base_env.py (10)
2. ml-agents-envs/mlagents_envs/rpc_utils.py (4)
3. ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py (65)
4. ml-agents/mlagents/trainers/demo_loader.py (23)
5. ml-agents/mlagents/trainers/tests/simple_test_envs.py (51)
6. ml-agents/mlagents/trainers/tests/test_simple_rl.py (114)

ml-agents-envs/mlagents_envs/base_env.py (10 changes)


     @property
     def agent_id_to_index(self) -> Dict[AgentId, int]:
         """
-        Returns the index of the agent_id in this BatchedStepResult, and
-        -1 if agent_id is not in this BatchedStepResult.
-        :param agent_id: The id of the agent
-        :returns: The index of the agent_id, and -1 if not found.
+        :returns: A Dict that maps agent_id to the index of those agents in
+        this BatchedStepResult.
         """
         if self._agent_id_to_index is None:
             self._agent_id_to_index = {}

         """
         if not self.contains_agent(agent_id):
             raise IndexError(
-                "agent_id {} is not present in the BatchedStepResult".format(agent_id)
+                "get_agent_step_result failed. agent_id {} is not present in the BatchedStepResult".format(
+                    agent_id
+                )
             )
         agent_index = self._agent_id_to_index[agent_id]  # type: ignore
         agent_obs = []
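For orientation (not part of the diff), a minimal usage sketch of the property, assuming step_result is a BatchedStepResult with at least one agent:

    # Hypothetical usage: map an agent id to its row in the batched arrays.
    index = step_result.agent_id_to_index[step_result.agent_id[0]]
    reward = step_result.reward[index]
    done = step_result.done[index]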

ml-agents-envs/mlagents_envs/rpc_utils.py (4 changes)


 from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto
 from mlagents_envs.communicator_objects.observation_pb2 import (
     ObservationProto,
-    NONE as COMPRESSION_NONE,
+    NONE as COMPRESSION_TYPE_NONE,
 )
 from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
 import numpy as np

f"Observation did not have the expected shape - got {obs.shape} but expected {expected_shape}"
)
gray_scale = obs.shape[2] == 1
if obs.compression_type == COMPRESSION_NONE:
if obs.compression_type == COMPRESSION_TYPE_NONE:
img = np.array(obs.float_data.data, dtype=np.float32)
img = np.reshape(img, obs.shape)
return img
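The rename only touches the alias for the NONE compression type; the uncompressed path still ships the flattened floats in float_data and reshapes them. A round-trip sketch of that path (illustrative, using the same proto fields the code above reads):

    import numpy as np
    from mlagents_envs.communicator_objects.observation_pb2 import (
        ObservationProto,
        NONE as COMPRESSION_TYPE_NONE,
    )

    # Round-trip a (20, 20, 3) float observation through an uncompressed proto.
    arr = np.random.rand(20, 20, 3).astype(np.float32)
    proto = ObservationProto(
        float_data=ObservationProto.FloatData(data=arr.flatten()),
        shape=list(arr.shape),
        compression_type=COMPRESSION_TYPE_NONE,
    )
    img = np.reshape(np.array(proto.float_data.data, dtype=np.float32), proto.shape)
    assert np.allclose(img, arr)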

ml-agents-envs/mlagents_envs/tests/test_rpc_utils.py (65 changes)


     PNG,
 )
 from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
-from mlagents_envs.base_env import AgentGroupSpec, ActionType
+from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
+    AgentInfoActionPairProto,
+)
+from mlagents_envs.communicator_objects.agent_action_pb2 import AgentActionProto
+from mlagents_envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
 from mlagents_envs.exception import UnityObservationException
 from mlagents_envs.rpc_utils import (
     agent_group_spec_from_proto,

     obs_proto.compression_type = NONE
     obs_proto.shape.extend(in_array.shape)
     return obs_proto
+
+
+def proto_from_batched_step_result(
+    batched_step_result: BatchedStepResult
+) -> List[AgentInfoProto]:
+    agent_info_protos: List[AgentInfoProto] = []
+    for agent_id in batched_step_result.agent_id:
+        agent_id_index = batched_step_result.agent_id_to_index[agent_id]
+        reward = batched_step_result.reward[agent_id_index]
+        done = batched_step_result.done[agent_id_index]
+        max_step_reached = batched_step_result.max_step[agent_id_index]
+        agent_mask = None
+        if batched_step_result.action_mask is not None:
+            agent_mask = []  # type: ignore
+            for _branch in batched_step_result.action_mask:
+                agent_mask = np.concatenate(
+                    (agent_mask, _branch[agent_id_index, :]), axis=0
+                )
+        observations: List[ObservationProto] = []
+        for all_observations_of_type in batched_step_result.obs:
+            observation = all_observations_of_type[agent_id_index]
+            if len(observation.shape) == 3:
+                observations.append(generate_uncompressed_proto_obs(observation))
+            else:
+                observations.append(
+                    ObservationProto(
+                        float_data=ObservationProto.FloatData(data=observation),
+                        shape=[len(observation)],
+                        compression_type=NONE,
+                    )
+                )
+        agent_info_proto = AgentInfoProto(
+            reward=reward,
+            done=done,
+            id=agent_id,
+            max_step_reached=max_step_reached,
+            action_mask=agent_mask,
+            observations=observations,
+        )
+        agent_info_protos.append(agent_info_proto)
+    return agent_info_protos
+
+
+# The arguments here are the BatchedStepResult and actions for a single agent name
+def proto_from_batched_step_result_and_action(
+    batched_step_result: BatchedStepResult, actions: np.ndarray
+) -> List[AgentInfoActionPairProto]:
+    agent_info_protos = proto_from_batched_step_result(batched_step_result)
+    agent_action_protos = [
+        AgentActionProto(vector_actions=action) for action in actions
+    ]
+    agent_info_action_pair_protos = [
+        AgentInfoActionPairProto(agent_info=agent_info_proto, action_info=action_proto)
+        for agent_info_proto, action_proto in zip(
+            agent_info_protos, agent_action_protos
+        )
+    ]
+    return agent_info_action_pair_protos
+
+
 def test_process_pixels():
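As a usage sketch (hypothetical values; in the tests the BatchedStepResult comes from the simple test environments), the new helper pairs each agent's AgentInfoProto with its row of actions:

    # Hypothetical: two agents, one continuous action each.
    actions = np.array([[0.5], [-0.5]], dtype=np.float32)
    pairs = proto_from_batched_step_result_and_action(step_result, actions)
    assert len(pairs) == len(step_result.agent_id)
    assert list(pairs[0].action_info.vector_actions) == [0.5]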

ml-agents/mlagents/trainers/demo_loader.py (23 changes)


 )
 from mlagents_envs.timers import timed, hierarchical_timer
 from google.protobuf.internal.decoder import _DecodeVarint32  # type: ignore
+from google.protobuf.internal.encoder import _EncodeVarint  # type: ignore

 @timed

 )
+
+INITIAL_POS = 33
+
 @timed
 def load_demonstration(
     file_path: str

     """
     # First 32 bytes of file dedicated to meta-data.
-    INITIAL_POS = 33
     file_paths = get_demo_files(file_path)
     group_spec = None
     brain_param_proto = None

             f"No BrainParameters found in demonstration file at {file_path}."
         )
     return group_spec, info_action_pairs, total_expected
+
+
+def write_delimited(f, message):
+    msg_string = message.SerializeToString()
+    msg_size = len(msg_string)
+    _EncodeVarint(f.write, msg_size)
+    f.write(msg_string)
+
+
+def write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos):
+    with open(demo_path, "wb") as f:
+        # write metadata
+        write_delimited(f, meta_data_proto)
+        f.seek(INITIAL_POS)
+        write_delimited(f, brain_param_proto)
+        for agent in agent_info_protos:
+            write_delimited(f, agent)
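write_delimited mirrors the varint length-prefix framing that load_demonstration already decodes with _DecodeVarint32, and write_demo seeks to INITIAL_POS so the BrainParametersProto always starts right after the fixed-size metadata block. A reading counterpart (a sketch, not part of this commit) could look like:

    from google.protobuf.internal.decoder import _DecodeVarint32  # type: ignore

    def read_delimited(data, pos, message):
        # Hypothetical inverse of write_delimited: decode the varint size
        # prefix, then parse that many bytes into the supplied proto message.
        msg_size, pos = _DecodeVarint32(data, pos)
        message.ParseFromString(data[pos : pos + msg_size])
        return message, pos + msg_size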

ml-agents/mlagents/trainers/tests/simple_test_envs.py (51 changes)


     BatchedStepResult,
     ActionType,
 )
+from mlagents_envs.tests.test_rpc_utils import proto_from_batched_step_result_and_action
+from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
+    AgentInfoActionPairProto,
+)

 OBS_SIZE = 1
 VIS_OBS_SIZE = (20, 20, 3)

 class Memory1DEnvironment(Simple1DEnvironment):
     def __init__(self, brain_names, use_discrete, step_size=0.2):
-        super().__init__(brain_names, use_discrete, step_size=0.2)
+        super().__init__(brain_names, use_discrete, step_size=step_size)
         # Number of steps to reveal the goal for. Lower is harder. Should be
         # less than 1/step_size to force agent to use memory
         self.num_show_steps = 2
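As a quick sanity check on that comment: with the default step_size=0.2 an agent needs on the order of 1/0.2 = 5 steps to reach the goal, so revealing the goal for only num_show_steps = 2 steps forces the policy to carry it in memory for the remaining steps.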

             m_agent_id,
             action_mask,
         )
+
+
+class Record1DEnvironment(Simple1DEnvironment):
+    def __init__(
+        self,
+        brain_names,
+        use_discrete,
+        step_size=0.2,
+        num_visual=0,
+        num_vector=1,
+        n_demos=30,
+    ):
+        super().__init__(
+            brain_names,
+            use_discrete,
+            step_size=step_size,
+            num_visual=num_visual,
+            num_vector=num_vector,
+        )
+        self.demonstration_protos: Dict[str, List[AgentInfoActionPairProto]] = {}
+        self.n_demos = n_demos
+        for name in self.names:
+            self.demonstration_protos[name] = []
+
+    def step(self) -> None:
+        super().step()
+        for name in self.names:
+            self.demonstration_protos[
+                name
+            ] += proto_from_batched_step_result_and_action(
+                self.step_result[name], self.action[name]
+            )
+            self.demonstration_protos[name] = self.demonstration_protos[name][
+                -self.n_demos :
+            ]
+
+    def solve(self) -> None:
+        self.reset()
+        for _ in range(self.n_demos):
+            for name in self.names:
+                if self.discrete:
+                    self.action[name] = [[1]] if self.goal[name] > 0 else [[0]]
+                else:
+                    self.action[name] = [[float(self.goal[name])]]
+                self.step()
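A minimal recording session with the new class (a hypothetical sketch; the brain name is arbitrary):

    # Record up to 30 optimal (observation, action) pairs for a single brain.
    env = Record1DEnvironment(["1D"], use_discrete=False, n_demos=30)
    env.solve()  # executes the optimal policy, recording protos on each step()
    pairs = env.demonstration_protos["1D"]
    assert len(pairs) <= 30  # step() keeps only the most recent n_demos entries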

ml-agents/mlagents/trainers/tests/test_simple_rl.py (114 changes)


 from mlagents.trainers.tests.simple_test_envs import (
     Simple1DEnvironment,
     Memory1DEnvironment,
+    Record1DEnvironment,
 )
+from mlagents.trainers.demo_loader import write_demo
+from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
+    DemonstrationMetaProto,
+)
+from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
+from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous

 BRAIN_NAME = "1D"

     assert any(reward > success_threshold for reward in processed_rewards) and any(
         reward < success_threshold for reward in processed_rewards
     )
+
+
+@pytest.fixture(scope="session")
+def simple_record(tmpdir_factory):
+    def record_demo(use_discrete, num_visual=0, num_vector=1):
+        env = Record1DEnvironment(
+            [BRAIN_NAME],
+            use_discrete=use_discrete,
+            num_visual=num_visual,
+            num_vector=num_vector,
+            n_demos=100,
+        )
+        # If we want to use true demos, we can solve the env in the usual way
+        # Otherwise, we can just call solve to execute the optimal policy
+        env.solve()
+        agent_info_protos = env.demonstration_protos[BRAIN_NAME]
+        meta_data_proto = DemonstrationMetaProto()
+        brain_param_proto = BrainParametersProto(
+            vector_action_size=[1],
+            vector_action_descriptions=[""],
+            vector_action_space_type=discrete if use_discrete else continuous,
+            brain_name=BRAIN_NAME,
+            is_training=True,
+        )
+        action_type = "Discrete" if use_discrete else "Continuous"
+        demo_path_name = "1DTest" + action_type + ".demo"
+        demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
+        write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
+        return demo_path
+
+    return record_demo
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+@pytest.mark.parametrize("trainer_config", [PPO_CONFIG, SAC_CONFIG])
+def test_gail(simple_record, use_discrete, trainer_config):
+    demo_path = simple_record(use_discrete)
+    env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
+    override_vals = {
+        "max_steps": 500,
+        "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
+        "reward_signals": {
+            "gail": {
+                "strength": 1.0,
+                "gamma": 0.99,
+                "encoding_size": 32,
+                "demo_path": demo_path,
+            }
+        },
+    }
+    config = generate_config(trainer_config, override_vals)
+    _check_environment_trains(env, config, success_threshold=0.9)
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_gail_visual_ppo(simple_record, use_discrete):
+    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
+    env = Simple1DEnvironment(
+        [BRAIN_NAME],
+        num_visual=1,
+        num_vector=0,
+        use_discrete=use_discrete,
+        step_size=0.2,
+    )
+    override_vals = {
+        "max_steps": 1000,
+        "learning_rate": 3.0e-4,
+        "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
+        "reward_signals": {
+            "gail": {
+                "strength": 1.0,
+                "gamma": 0.99,
+                "encoding_size": 32,
+                "demo_path": demo_path,
+            }
+        },
+    }
+    config = generate_config(PPO_CONFIG, override_vals)
+    _check_environment_trains(env, config, success_threshold=0.9)
+
+
+@pytest.mark.parametrize("use_discrete", [True, False])
+def test_gail_visual_sac(simple_record, use_discrete):
+    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
+    env = Simple1DEnvironment(
+        [BRAIN_NAME],
+        num_visual=1,
+        num_vector=0,
+        use_discrete=use_discrete,
+        step_size=0.2,
+    )
+    override_vals = {
+        "max_steps": 500,
+        "batch_size": 16,
+        "learning_rate": 3.0e-4,
+        "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
+        "reward_signals": {
+            "gail": {
+                "strength": 1.0,
+                "gamma": 0.99,
+                "encoding_size": 32,
+                "demo_path": demo_path,
+            }
+        },
+    }
+    config = generate_config(SAC_CONFIG, override_vals)
+    _check_environment_trains(env, config, success_threshold=0.9)
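The same override mechanism would also support exercising behavioral cloning on its own; a hedged sketch mirroring the tests above (test_bc_only is hypothetical, not part of this commit):

    @pytest.mark.parametrize("use_discrete", [True, False])
    def test_bc_only(simple_record, use_discrete):
        # Hypothetical: behavioral cloning without a GAIL reward signal.
        demo_path = simple_record(use_discrete)
        env = Simple1DEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
        override_vals = {
            "max_steps": 500,
            "behavioral_cloning": {"demo_path": demo_path, "strength": 1.0, "steps": 1000},
        }
        config = generate_config(PPO_CONFIG, override_vals)
        _check_environment_trains(env, config, success_threshold=0.9)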