
cleanup, don't store mask

/MLA-1734-demo-provider
Chris Elion, 4 years ago
Current commit
c3bc8991
6 files changed, with 97 insertions and 50 deletions
  1. ml-agents/mlagents/trainers/buffer.py (4 changes)
  2. ml-agents/mlagents/trainers/demonstrations/demonstration_provider.py (27 changes)
  3. ml-agents/mlagents/trainers/demonstrations/local_demonstration_provider.py (14 changes)
  4. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py (50 changes)
  5. ml-agents/mlagents/trainers/torch/components/bc/module.py (25 changes)
  6. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (27 changes)

ml-agents/mlagents/trainers/buffer.py (4 changes)


        if key_list is None:
            key_list = list(self.keys())
        if not self.check_length(key_list):
+             lengths = {k: len(self._fields[k]) for k in key_list}
+             lengths_str = "\n\t".join(str((k, v)) for k, v in lengths.items())
            raise BufferException(
-                 f"The length of the fields {key_list} were not of same length"
+                 f"The length of the fields were not of same length: {lengths_str}"
            )
        for field_key in key_list:
            target_buffer[field_key].extend(
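For reference, a hypothetical illustration of what the improved exception message looks like; the field names and lengths below are invented, and only the formatting logic comes from the change above:

lengths = {"obs_0": 128, "actions": 127}  # made-up mismatched field lengths
lengths_str = "\n\t".join(str((k, v)) for k, v in lengths.items())
print(f"The length of the fields were not of same length: {lengths_str}")
# The length of the fields were not of same length: ('obs_0', 128)
#     ('actions', 127)

Each offending field and its length is now listed, instead of only the key names.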

27
ml-agents/mlagents/trainers/demonstrations/demonstration_provider.py


    done: bool
    action: ActionTuple
    prev_action: np.ndarray
    action_mask: np.ndarray
    interrupted: bool

            agent_buffer_trajectory[BufferKey.MASKS].append(1.0)
            agent_buffer_trajectory[BufferKey.DONE].append(exp.done)
            # Adds the log prob and action of continuous/discrete separately
            agent_buffer_trajectory[BufferKey.CONTINUOUS_ACTION].append(
                exp.action.continuous
            )

            # Store action masks if necessary. Note that 1 means active, while
            # in AgentExperience False means active.
            if exp.action_mask is not None:
                mask = 1 - np.concatenate(exp.action_mask)
                agent_buffer_trajectory[BufferKey.ACTION_MASK].append(
                    mask, padding_value=1
                )
            else:
                # This should never be needed unless the environment somehow doesn't supply the
                # action mask in a discrete space.
                action_shape = exp.action.discrete.shape
                agent_buffer_trajectory[BufferKey.ACTION_MASK].append(
                    np.ones(action_shape, dtype=np.float32), padding_value=1
                )
            agent_buffer_trajectory[BufferKey.PREV_ACTION].append(exp.prev_action)
            agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS].append(exp.reward)

    @abc.abstractmethod
    def pop_trajectories(self) -> List[DemonstrationTrajectory]:
        pass

    def to_agentbuffer(self, training_length: int) -> AgentBuffer:
        buffer_out = AgentBuffer()
        trajectories = self.pop_trajectories()
        for trajectory in trajectories:
            temp_buffer = trajectory.to_agentbuffer()
            temp_buffer.resequence_and_append(
                buffer_out, batch_size=None, training_length=training_length
            )
        return buffer_out
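To make the mask conversion above concrete, here is a minimal sketch of the inversion; the branch sizes and mask values are invented, and only the 1 - np.concatenate(...) step comes from the code above:

import numpy as np

# In the experience, False means the action is active; one boolean array per discrete branch.
action_mask = [np.array([False, True]), np.array([False, False, True])]

# In the buffer, 1 means active, so the concatenated mask is inverted.
mask = 1 - np.concatenate(action_mask)
print(mask)  # [1 0 1 1 0]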

ml-agents/mlagents/trainers/demonstrations/local_demonstration_provider.py (14 changes)


        action_tuple = LocalDemonstrationProvider._get_action_tuple(
            pair, behavior_spec.action_spec
        )
        action_mask = None
        if pair.agent_info.action_mask:
            # TODO 2D?
            action_mask = np.ndarray(
                [bool(m) for m in pair.agent_info.action_mask], dtype=np.bool
            )
        exp = DemonstrationExperience(
            obs=obs,

            prev_action=previous_action,
            action_mask=action_mask,

-         previous_action = np.ndarray(
+         previous_action = np.array(
            pair.action_info.vector_actions_deprecated, dtype=np.float32
        )
        if pair.agent_info.done or pair_index == len(info_action_pairs) - 1:

        # TODO 2D?
        continuous_np = (
-             np.ndarray(continuous_actions, dtype=np.float32)
+             np.array(continuous_actions, dtype=np.float32)
-         np.ndarray(discrete_actions, dtype=np.float32) if discrete_actions else None
+         np.array(discrete_actions, dtype=np.float32) if discrete_actions else None
        return ActionTuple(continuous_np, discrete_np)
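The np.ndarray to np.array replacements above are genuine bug fixes: np.ndarray() treats its first positional argument as the shape of a new, uninitialized array rather than as data to convert. A small sketch of the difference, with made-up input values:

import numpy as np

values = [True, False, True]

wrong = np.ndarray(values)                  # interprets [1, 0, 1] as a shape: an empty (1, 0, 1) array
right = np.array(values, dtype=np.float32)  # converts the data itself

print(wrong.shape)  # (1, 0, 1)
print(right)        # [1. 0. 1.]

With float inputs such as the deprecated vector actions above, np.ndarray would raise a TypeError outright rather than silently producing an empty array.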

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py (50 changes)


import os
import pytest
from unittest.mock import patch
import pytest
from unittest.mock import patch
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
import os
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.demonstrations.demonstration_provider import (
DemonstrationProvider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import GAILSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,

ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((20,))


class MockDemonstrationProvider(DemonstrationProvider):
    def __init__(self, behavior_spec, buffer):
        self._behavior_spec = behavior_spec
        self._buffer = buffer

    def get_behavior_spec(self) -> BehaviorSpec:
        return self._behavior_spec

    def pop_trajectories(self):
        raise NotImplementedError()

    def to_agentbuffer(self, training_length: int) -> AgentBuffer:
        return self._buffer


@pytest.mark.parametrize(
    "behavior_spec",
    [BehaviorSpec(create_observation_specs_with_shapes([(8,)]), ACTIONSPEC_CONTINUOUS)],

    ],
)
@pytest.mark.parametrize("use_actions", [False, True])
- @patch(
-     "mlagents.trainers.torch.components.reward_providers.gail_reward_provider.demo_to_buffer"
- )
+ @patch.object(GAILRewardProvider, "_get_demonstration_provider")
-     demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int
+     mock_demo_provider: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int

-     demo_to_buffer.return_value = None, buffer_expert
+     mock_demo_provider.return_value = MockDemonstrationProvider(
+         behavior_spec, buffer_expert
+     )
    gail_settings = GAILSettings(
        demo_path="", learning_rate=0.005, use_vail=False, use_actions=use_actions
    )

    ],
)
@pytest.mark.parametrize("use_actions", [False, True])
- @patch(
-     "mlagents.trainers.torch.components.reward_providers.gail_reward_provider.demo_to_buffer"
- )
+ @patch.object(GAILRewardProvider, "_get_demonstration_provider")
-     demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int
+     mock_demo_provider: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int

-     demo_to_buffer.return_value = None, buffer_expert
+     mock_demo_provider.return_value = MockDemonstrationProvider(
+         behavior_spec, buffer_expert
+     )
    gail_settings = GAILSettings(
        demo_path="", learning_rate=0.005, use_vail=True, use_actions=use_actions
    )
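As a quick, self-contained sketch of why the stub is sufficient here (assuming create_agent_buffer(behavior_spec, number) from the test utils imported above), the reward provider only ever asks the provider for a finished AgentBuffer:

buffer_expert = create_agent_buffer(behavior_spec, 10)
provider = MockDemonstrationProvider(behavior_spec, buffer_expert)

# GAILRewardProvider only calls to_agentbuffer(), so returning a pre-built
# buffer is all the patched _get_demonstration_provider needs to supply.
assert provider.to_agentbuffer(training_length=1) is buffer_expert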

ml-agents/mlagents/trainers/torch/components/bc/module.py (25 changes)


from mlagents.torch_utils import torch
from mlagents.trainers.policy.torch_policy import TorchPolicy
- from mlagents.trainers.demo_loader import demo_to_buffer
+ from mlagents.trainers.demonstrations.demonstration_provider import (
+     DemonstrationProvider,
+ )
+ from mlagents.trainers.demonstrations.local_demonstration_provider import (
+     LocalDemonstrationProvider,
+ )
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_log_probs import ActionLogProbs

        )
        params = self.policy.actor_critic.parameters()
        self.optimizer = torch.optim.Adam(params, lr=self.current_lr)
-         _, self.demonstration_buffer = demo_to_buffer(
-             settings.demo_path, policy.sequence_length, policy.behavior_spec
-         )
+         demo_provider = self._get_demonstration_provider(settings)
+         # TODO check policy.behavior_spec == demo_provider_spec
+         self.demonstration_buffer = demo_provider.to_agentbuffer(
+             training_length=policy.sequence_length
+         )
        self.batch_size = (
            settings.batch_size if settings.batch_size else default_batch_size
        )

        self.has_updated = False
        self.use_recurrent = self.policy.use_recurrent
        self.samples_per_update = settings.samples_per_update

    def _get_demonstration_provider(
        self, settings: BehavioralCloningSettings
    ) -> DemonstrationProvider:
        """
        Get the DemonstrationProvider as determined by the BehavioralCloningSettings.
        This is currently always a LocalDemonstrationProvider but could change in the future,
        based on the settings.
        """
        return LocalDemonstrationProvider(settings.demo_path)

    def update(self) -> Dict[str, np.ndarray]:
        """

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (27 changes)


)
from mlagents.trainers.settings import GAILSettings
from mlagents_envs.base_env import BehaviorSpec
+ from mlagents.trainers.demonstrations.demonstration_provider import (
+     DemonstrationProvider,
+ )
+ from mlagents.trainers.demonstrations.local_demonstration_provider import (
+     LocalDemonstrationProvider,
+ )
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.agent_action import AgentAction
from mlagents.trainers.torch.action_flattener import ActionFlattener

- from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.trajectory import ObsUtil

        self._ignore_done = True
        self._discriminator_network = DiscriminatorNetwork(specs, settings)
        self._discriminator_network.to(default_device())
-         _, self._demo_buffer = demo_to_buffer(
-             settings.demo_path, 1, specs
-         )  # This is supposed to be the sequence length but we do not have access here
+         demo_provider = self._get_demonstration_provider(settings)
+         # TODO check spec == demo_provider_spec
+         self._demo_buffer = demo_provider.to_agentbuffer(training_length=1)

    def _get_demonstration_provider(
        self, settings: GAILSettings
    ) -> DemonstrationProvider:
        """
        Get the DemonstrationProvider as determined by the GAILSettings.
        This is currently always a LocalDemonstrationProvider but could change in the future,
        based on the settings.
        """
        return LocalDemonstrationProvider(settings.demo_path)

    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        with torch.no_grad():

    def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
        expert_batch = self._demo_buffer.sample_mini_batch(
-             mini_batch.num_experiences, 1
+             mini_batch.num_experiences, sequence_length=1
        )
        loss, stats_dict = self._discriminator_network.compute_loss(
            mini_batch, expert_batch
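Pulling the new GAIL wiring together, a minimal sketch under the assumptions above; the demo path is a placeholder, and it assumes AgentBuffer.sample_mini_batch takes a batch size plus the sequence_length keyword used in the diff:

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.demonstrations.local_demonstration_provider import (
    LocalDemonstrationProvider,
)

# Built once, as in the constructor above ("expert.demo" is a placeholder path).
demo_buffer = LocalDemonstrationProvider("expert.demo").to_agentbuffer(training_length=1)


def sample_expert_batch(policy_batch: AgentBuffer) -> AgentBuffer:
    # Each update draws an expert batch the same size as the policy batch.
    return demo_buffer.sample_mini_batch(policy_batch.num_experiences, sequence_length=1)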
