- obs is a list of numpy arrays observations collected by the agent.
- reward is a float. Corresponds to the rewards collected by the agent
since the last simulation step.
- interrupted is a bool. Is true if the Agent was interrupted since the last
decision step. For example, if the Agent reached the maximum number of steps for
the episode.
- agent_id is an int and a unique identifier for the corresponding Agent.
"""

obs: List[np.ndarray]
reward: float
interrupted: bool
agent_id: AgentId
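
# Illustration (not part of this module): a minimal sketch of how a consumer
# might read a TerminalStep's fields; the handler name is hypothetical.
def _on_episode_end(step: TerminalStep) -> None:
    final_obs = step.obs          # list of np.ndarray, one per observation
    episode_reward = step.reward  # reward gathered since the last simulation step
    if step.interrupted:
        # The episode was cut short (e.g. max steps reached), so the last
        # state is not a true terminal state; a trainer may still bootstrap
        # a value estimate from final_obs here.
        pass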

- obs is a list of numpy arrays observations collected by the batch of
agents. The first dimension of each array corresponds to the batch size.
- reward is a float vector of length batch size. Corresponds to the
rewards collected by each agent since the last simulation step.
- interrupted is an array of booleans of length batch size. Is true if the
associated Agent was interrupted since the last decision step. For example, if the
Agent reached the maximum number of steps for the episode.
- agent_id is an int vector of length batch size containing unique
identifiers for the corresponding Agents.
"""
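
# Illustration (not part of the class): the batched layout described above,
# sketched with made-up values; the shapes, not the numbers, are the point.
_example = TerminalSteps(
    obs=[np.zeros((2, 4), dtype=np.float32)],        # (batch_size, obs_size)
    reward=np.array([1.0, -0.5], dtype=np.float32),  # one reward per agent
    interrupted=np.array([False, True]),             # second agent was cut short
    agent_id=np.array([3, 7], dtype=np.int32),
)
_mean_final_reward = _example.reward.mean()          # vectorized batch access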

def __init__(self, obs, reward, interrupted, agent_id):
    self.obs: List[np.ndarray] = obs
    self.reward: np.ndarray = reward
    self.interrupted: np.ndarray = interrupted
    self.agent_id: np.ndarray = agent_id
    # Lazily-built cache mapping an AgentId to its row in the batch.
    self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
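
# A sketch of how the lazy cache above might be exposed; the property name
# and placement are assumptions about the rest of the class.
@property
def agent_id_to_index(self) -> Dict[AgentId, int]:
    # Build the AgentId -> batch-row mapping once, on first access.
    if self._agent_id_to_index is None:
        self._agent_id_to_index = {
            agent_id: idx for idx, agent_id in enumerate(self.agent_id)
        }
    return self._agent_id_to_index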

return TerminalStep(
    obs=agent_obs,
    reward=self.reward[agent_index],
    interrupted=self.interrupted[agent_index],
    agent_id=agent_id,
)
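
# For context: agent_obs above is presumably the per-agent slice of each
# batched observation, along these lines (inferred, not quoted from this file):
#     agent_obs = [batched_obs[agent_index] for batched_obs in self.obs]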

return TerminalSteps(
    obs=obs,
    reward=np.zeros(0, dtype=np.float32),
    interrupted=np.zeros(0, dtype=np.bool_),
    agent_id=np.zeros(0, dtype=np.int32),
)
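
# Usage sketch, assuming this return belongs to a static empty(spec)
# constructor: an empty batch keeps consumers uniform when no agent
# terminated this step.
empty_steps = TerminalSteps.empty(spec)
assert len(empty_steps.agent_id) == 0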

- A DecisionSteps containing the observations,
the rewards, the agent ids and the action masks for the Agents
of the specified behavior. These Agents need an action this step.
- A TerminalSteps containing the observations,
rewards, agent ids and interrupted flags of the agents that had their
episode terminated last step.
"""
pass
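
# Usage sketch for this method; `env`, `behavior_name`, and the logging helper
# are assumptions for illustration, not part of this interface.
decision_steps, terminal_steps = env.get_steps(behavior_name)
# Agents whose episode ended are reported exactly once, in terminal_steps.
for agent_id in terminal_steps:
    step = terminal_steps[agent_id]
    log_episode_end(agent_id, step.reward, step.interrupted)  # hypothetical
# Agents in decision_steps still need an action this step.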