make_fake_trajectory/step take ActionSpec arg

/develop/action-spec-gym
Andrew Cohen, 4 years ago
Current commit
590adc01
12 files changed, 52 insertions(+), 66 deletions(-)
  1. ml-agents/mlagents/trainers/demo_loader.py (4 changes)
  2. ml-agents/mlagents/trainers/tests/mock_brain.py (50 changes)
  3. ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py (2 changes)
  4. ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py (8 changes)
  5. ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (12 changes)
  6. ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (9 changes)
  7. ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (4 changes)
  8. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (5 changes)
  9. ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)
  10. ml-agents/mlagents/trainers/tests/torch/test_ghost.py (2 changes)
  11. ml-agents/mlagents/trainers/tests/torch/test_policy.py (10 changes)
  12. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (8 changes)

ml-agents/mlagents/trainers/demo_loader.py (4 changes)

        != expected_behavior_spec.action_spec.discrete_branches
    ):
        raise RuntimeError(
-             "The continuous action dimensions {} in demonstration do not match the policy's {}.".format(
+             "The discrete action dimensions {} in demonstration do not match the policy's {}.".format(
    # check observations match
    if len(behavior_spec.observation_shapes) != len(
        expected_behavior_spec.observation_shapes
    ):
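For context, a minimal sketch of the compatibility check this message belongs to, assuming the same continuous/discrete split used elsewhere in this diff. The helper name and surrounding code are illustrative assumptions, not the actual demo_loader source:

from mlagents_envs.base_env import BehaviorSpec


def check_action_spec_match(behavior_spec: BehaviorSpec, expected_behavior_spec: BehaviorSpec) -> None:
    # Hypothetical helper: compare the demonstration's ActionSpec against the policy's.
    demo_spec = behavior_spec.action_spec
    policy_spec = expected_behavior_spec.action_spec
    if demo_spec.continuous_size != policy_spec.continuous_size:
        raise RuntimeError(
            "The continuous action dimensions {} in demonstration do not match the policy's {}.".format(
                demo_spec.continuous_size, policy_spec.continuous_size
            )
        )
    if demo_spec.discrete_branches != policy_spec.discrete_branches:
        raise RuntimeError(
            "The discrete action dimensions {} in demonstration do not match the policy's {}.".format(
                demo_spec.discrete_branches, policy_spec.discrete_branches
            )
        )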

ml-agents/mlagents/trainers/tests/mock_brain.py (50 changes)

- from typing import List, Tuple, Union
- from collections.abc import Iterable
+ from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer

def create_mock_steps(
    num_agents: int,
    observation_shapes: List[Tuple],
-     action_shape: Union[int, Tuple[int]] = None,
-     discrete: bool = False,
+     action_spec: ActionSpec,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """

-     :bool discrete: Whether or not action space is discrete
    :bool done: Whether all the agents in the batch are done
    """
-     if action_shape is None:
-         action_shape = 2
-     if discrete and isinstance(action_shape, Iterable):
+     if action_spec.is_discrete():
-         for action_size in action_shape  # type: ignore
+         for action_size in action_spec.discrete_branches  # type: ignore
-     if discrete:
-         action_spec = ActionSpec(0, action_shape)
-     else:
-         action_spec = ActionSpec(action_shape, ())
    behavior_spec = BehaviorSpec(observation_shapes, action_spec)
    if done:
        return (

def create_steps_from_behavior_spec(
    behavior_spec: BehaviorSpec, num_agents: int = 1
) -> Tuple[DecisionSteps, TerminalSteps]:
-     action_spec = behavior_spec.action_spec
-     is_discrete = action_spec.is_discrete()
-         action_shape=action_spec.discrete_branches
-         if is_discrete
-         else action_spec.continuous_size,
-         discrete=is_discrete,
+         action_spec=behavior_spec.action_spec,
    )

+     action_spec: ActionSpec,
-     action_space: Union[int, Tuple[int]] = 2,
-     is_discrete: bool = True,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,

+     action_size = action_spec.size
+     action_probs = np.ones(np.sum(action_spec.total_size), dtype=np.float32)
    for _i in range(length - 1):
        obs = []
        for _shape in observation_shapes:

-         if is_discrete:
-             action_size = len(action_space)  # type: ignore
-             action_probs = np.ones(np.sum(action_space), dtype=np.float32)
-         else:
-             action_size = int(action_space)  # type: ignore
-             action_probs = np.ones((action_size), dtype=np.float32)
-             [[False for _ in range(branch)] for branch in action_space]  # type: ignore
-             if is_discrete
+             [
+                 [False for _ in range(branch)]
+                 for branch in action_spec.discrete_branches
+             ]  # type: ignore
+             if action_spec.is_discrete()
            else None
        )
        prev_action = np.ones(action_size, dtype=np.float32)

    memory_size: int = 10,
    exclude_key_list: List[str] = None,
) -> AgentBuffer:
-     is_discrete = behavior_spec.action_spec.is_discrete()
-     if is_discrete:
-         action_space = behavior_spec.action_spec.discrete_branches
-     else:
-         action_space = behavior_spec.action_spec.continuous_size
-         action_space=action_space,
+         action_spec=behavior_spec.action_spec,
-         is_discrete=is_discrete,
    )
    buffer = trajectory.to_agentbuffer()
    # If a key_list was given, remove those keys
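To illustrate the updated helper signature, a hedged usage sketch based only on the parameters visible in this diff; the ActionSpec constructor is assumed to take (continuous_size, discrete_branches), as the test constants elsewhere in this change suggest:

from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.tests.mock_brain import make_fake_trajectory

# Discrete spec: no continuous actions, a single branch with 2 choices
# (same pattern as ActionSpec(0, (2,)) used in test_rl_trainer.py below).
discrete_spec = ActionSpec(0, (2,))
# Continuous spec: 2 continuous actions, no discrete branches.
continuous_spec = ActionSpec(2, ())

trajectory = make_fake_trajectory(
    length=10,
    observation_shapes=[(1,)],
    max_step_complete=True,
    action_spec=discrete_spec,
)
buffer = trajectory.to_agentbuffer()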

ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py (2 changes)

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py (8 changes)

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (12 changes)

    ppo_dummy_config,
)
+ from mlagents_envs.base_env import ActionSpec

@pytest.fixture
def dummy_config():

DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12
+ CONTINUOUS_ACTION_SPEC = ActionSpec(VECTOR_ACTION_SPACE, ())
+ DISCRETE_ACTION_SPEC = ActionSpec(0, tuple(DISCRETE_ACTION_SPACE))

def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):

        length=time_horizon,
        observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
        max_step_complete=True,
-         action_space=DISCRETE_ACTION_SPACE if discrete else VECTOR_ACTION_SPACE,
-         is_discrete=discrete,
+         action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
    )
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False

        length=time_horizon,
        observation_shapes=behavior_spec.observation_shapes,
        max_step_complete=True,
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

        length=time_horizon + 1,
        max_step_complete=False,
        observation_shapes=behavior_spec.observation_shapes,
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py (9 changes)

        length=15,
        observation_shapes=specs.observation_shapes,
        max_step_complete=True,
-         action_space=2,
-         is_discrete=False,
+         action_spec=specs.action_spec,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

        length=6,
        observation_shapes=specs.observation_shapes,
        max_step_complete=False,
-         action_space=2,
-         is_discrete=False,
+         action_spec=specs.action_spec,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    trajectory = make_fake_trajectory(
        length=5,
        observation_shapes=specs.observation_shapes,
+         action_spec=specs.action_spec,
-         action_space=2,
-         is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py (4 changes)

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy1.update_normalization(trajectory_buffer["vector_obs"])

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (5 changes)

from mlagents.trainers.tests.test_buffer import construct_fake_buffer
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.settings import TrainerSettings, FrameworkType
+ from mlagents_envs.base_env import ActionSpec

# Add concrete implementations of abstract methods

        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
-         action_space=[2],
+         action_spec=ActionSpec(0, (2,)),
    )
    trajectory_queue.put(trajectory)

        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
-         action_space=[2],
+         action_spec=ActionSpec(0, (2,)),
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5

ml-agents/mlagents/trainers/tests/test_trajectory.py (4 changes)

from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.tests.mock_brain import make_fake_trajectory
+ from mlagents_envs.base_env import ActionSpec

VEC_OBS_SIZE = 6
ACTION_SIZE = 4

    trajectory = make_fake_trajectory(
        length=length,
        observation_shapes=[(VEC_OBS_SIZE,), (84, 84, 3)],
-         action_space=[ACTION_SIZE],
+         action_spec=ActionSpec(ACTION_SIZE, ()),
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()

ml-agents/mlagents/trainers/tests/torch/test_ghost.py (2 changes)

        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
-         action_space=[2],
+         action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

ml-agents/mlagents/trainers/tests/torch/test_policy.py (10 changes)

        memories=memories,
        seq_len=policy.sequence_length,
    )
-     assert log_probs.shape == (64, policy.action_spec.action_size)
-     assert entropy.shape == (64, policy.action_spec.action_size)
+     assert log_probs.shape == (64, policy.action_spec.size)
+     assert entropy.shape == (64, policy.action_spec.size)
    for val in values.values():
        assert val.shape == (64,)

        all_log_probs=not policy.use_continuous_act,
    )
    if discrete:
-         assert log_probs.shape == (64, sum(policy.action_spec.discrete_action_branches))
+         assert log_probs.shape == (64, sum(policy.action_spec.discrete_branches))
-         assert log_probs.shape == (64, policy.action_spec.continuous_action_size)
-     assert entropies.shape == (64, policy.action_spec.action_size)
+         assert log_probs.shape == (64, policy.action_spec.continuous_size)
+     assert entropies.shape == (64, policy.action_spec.size)
    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
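The updated assertions rely on the renamed ActionSpec accessors. A hedged summary of the mapping implied by the lines above, with a small example; the exact semantics of each property are inferred only from this diff:

from mlagents_envs.base_env import ActionSpec

# Renames implied by the updated asserts (old name -> new name):
#   action_spec.action_size              -> action_spec.size
#   action_spec.discrete_action_branches -> action_spec.discrete_branches
#   action_spec.continuous_action_size   -> action_spec.continuous_size

spec = ActionSpec(0, (3, 3, 3, 2))  # discrete-only spec, as in the PPO test constants
assert spec.continuous_size == 0
assert spec.discrete_branches == (3, 3, 3, 2)
assert sum(spec.discrete_branches) == 11  # matches the log_probs width asserted above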

ml-agents/mlagents/trainers/tests/torch/test_ppo.py (8 changes)

    gail_dummy_config,
)
+ from mlagents_envs.base_env import ActionSpec

@pytest.fixture
def dummy_config():

DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12
+ CONTINUOUS_ACTION_SPEC = ActionSpec(VECTOR_ACTION_SPACE, ())
+ DISCRETE_ACTION_SPEC = ActionSpec(0, tuple(DISCRETE_ACTION_SPACE))

def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):

    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
+         action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
-         action_space=DISCRETE_ACTION_SPACE if discrete else VECTOR_ACTION_SPACE,
-         is_discrete=discrete,
    )
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
