
added some docstrings

/develop/rm-rf-new-models
vincentpierre, 4 years ago
Current commit
12619155
5 files changed, with 23 additions and 234 deletions
  1. ml-agents/mlagents/trainers/policy/torch_policy.py (6 changes)
  2. ml-agents/mlagents/trainers/torch/networks.py (10 changes)
  3. ml-agents/mlagents/trainers/torch/utils.py (6 changes)
  4. ml-agents/mlagents/trainers/trajectory.py (20 changes)
  5. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider copy.py (215 changes)

ml-agents/mlagents/trainers/policy/torch_policy.py (6 changes)


    def update_normalization(self, buffer: AgentBuffer) -> None:
        """
        If this policy normalizes vector observations, this will update the norm values in the graph.
        :param vector_obs: The vector observations to add to the running estimate of the distribution.
        :param buffer: The buffer with the observations to add to the running estimate
        of the distribution.
        """
        if self.use_vec_obs and self.normalize:
            self.actor_critic.update_normalization(buffer)
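For context, a minimal sketch of what "updating the norm values" from a buffer of observations typically involves. The RunningNorm class below is illustrative only and is not the actor_critic normalizer used here:

import numpy as np

class RunningNorm:
    """Illustrative running mean/variance tracker for vector observations."""
    def __init__(self, size: int):
        self.mean = np.zeros(size)
        self.var = np.ones(size)
        self.count = 1e-4  # avoids division by zero on the first update

    def update(self, batch: np.ndarray) -> None:
        # Merge the batch statistics into the running estimate (parallel-variance form).
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

# e.g. RunningNorm(3).update(np.random.randn(64, 3))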

        all_log_probs: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        :param vec_obs: List of vector observations.
        :param vis_obs: List of visual observations.
        :param obs: List of observations.
        :param masks: Loss masks for RNN, else None.
        :param memories: Input memories when using RNN, else None.
        :param seq_len: Sequence length when using RNN.

ml-agents/mlagents/trainers/torch/networks.py (10 changes)


"""
Returns distributions from this Actor, from which actions can be sampled.
If memory is enabled, return the memories as well.
:param vec_inputs: A List of vector inputs as tensors.
:param vis_inputs: A List of visual inputs as tensors.
:param vec_inputs: A List of inputs as tensors.
:param masks: If using discrete actions, a Tensor of action masks.
:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.

    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        """
        Get value outputs for the given obs.
        :param vec_inputs: List of vector inputs as tensors.
        :param vis_inputs: List of visual inputs as tensors.
        :param inputs: List of inputs as tensors.
        :param memories: Tensor of memories, if using memory. Otherwise, None.
        :returns: Dict of reward stream to output tensor for values.
        """

"""
Returns distributions, from which actions can be sampled, and value estimates.
If memory is enabled, return the memories as well.
:param vec_inputs: A List of vector inputs as tensors.
:param vis_inputs: A List of visual inputs as tensors.
:param inputs: A List of vector inputs as tensors.
:param masks: If using discrete actions, a Tensor of action masks.
:param memories: If using memory, a Tensor of initial memories.
:param sequence_length: If using memory, the sequence length.

"""
Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs.
"""
# This code will convert the ugly vec and obs into glorious unified list of inputs
concatenated_vec_obs = vec_inputs[0]
inputs = []
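A minimal sketch of the "unified list of inputs" idea from the comment above (the diff only shows the first lines of that conversion). The helper name unify_inputs and the choice to concatenate all vector inputs are assumptions, not the actual forward() logic:

import torch

def unify_inputs(vec_inputs, vis_inputs):
    # Collapse the separate vector/visual arguments into one observation list:
    # all vector observations are concatenated into a single tensor, and each
    # visual observation stays as its own entry.
    inputs = []
    if vec_inputs:
        inputs.append(torch.cat(vec_inputs, dim=1))
    inputs.extend(vis_inputs)
    return inputs

# unify_inputs([torch.zeros(1, 3), torch.zeros(1, 4)], [torch.zeros(1, 3, 84, 84)])
# -> [tensor of shape (1, 7), tensor of shape (1, 3, 84, 84)]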

ml-agents/mlagents/trainers/torch/utils.py (6 changes)


        vis_encode_type: EncoderType,
    ) -> Tuple[nn.Module, int]:
        """
        Returns the encoder and the size of the generated embedding
        Returns the encoder and the size of the appropriate encoder.
        :param shape: Tuples that represent the observation dimension.
        :param normalize: Normalize all vector inputs.
        :param h_size: Number of hidden units per layer.
        :param vis_encode_type: Type of visual encoder to use.
        """
        if len(shape) == 1:
            # Case rank 1 tensor
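The rank check above dispatches on the observation shape. A hedged sketch of that dispatch pattern, with plain torch.nn layers standing in for the real encoder classes; the function name pick_encoder and the embedding size returned for the visual case are placeholders:

from typing import Tuple
import torch.nn as nn

def pick_encoder(shape: Tuple[int, ...], h_size: int) -> Tuple[nn.Module, int]:
    # Rank-1 shapes are treated as vector observations, rank-3 shapes (H, W, C)
    # as visual observations; anything else is rejected.
    if len(shape) == 1:
        return nn.Linear(shape[0], h_size), h_size
    if len(shape) == 3:
        # Placeholder visual encoder; the real code computes the flattened conv output size.
        return nn.Conv2d(shape[2], h_size, kernel_size=3), h_size
    raise ValueError(f"Unsupported observation rank: {len(shape)}")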

ml-agents/mlagents/trainers/trajectory.py (20 changes)


class ObsUtil:
    @staticmethod
    def get_obs_with_rank(observations: List[np.array], rank: int) -> List[np.array]:
        result: List[np.array] = []
        for obs in observations:
            if len(obs.shape) == rank:
                result += [obs]
        return result
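For example, get_obs_with_rank splits a mixed observation list into its vector (rank-1) and visual (rank-3) parts:

import numpy as np

observations = [np.zeros(8), np.zeros((84, 84, 3)), np.zeros(4)]
vector_obs = ObsUtil.get_obs_with_rank(observations, rank=1)  # shapes (8,) and (4,)
visual_obs = ObsUtil.get_obs_with_rank(observations, rank=3)  # shape (84, 84, 3)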
    @staticmethod
        """
        returns the name of the observation given the index of the observation
        """

        """
        returns the name of the next observation given the index of the observation
        """

        """
        Creates the list of observations from an AgentBuffer
        """
        result: List[np.array] = []
        for i in range(num_obs):
            result.append(batch[ObsUtil.get_name_at(i)])

    def from_buffer_next(batch: AgentBuffer, num_obs: int) -> List[np.array]:
        """
        Creates the list of next observations from an AgentBuffer
        """
        result = []
        for i in range(num_obs):
            result.append(batch[ObsUtil.get_name_at_next(i)])
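A usage sketch for the two buffer helpers, assuming a populated AgentBuffer named batch and an agent that produces two observations; each helper assembles one array batch per observation index:

# current_obs[i] and next_obs[i] are the batched arrays for observation i.
current_obs = ObsUtil.from_buffer(batch, num_obs=2)
next_obs = ObsUtil.from_buffer_next(batch, num_obs=2)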

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider copy.py (215 changes)


import numpy as np
from typing import Dict
from mlagents.torch_utils import torch, default_device
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
    BaseRewardProvider,
)
from mlagents.trainers.settings import CuriositySettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import LinearEncoder, linear_layer
from mlagents.trainers.settings import NetworkSettings, EncoderType
from mlagents.trainers.trajectory import ObsUtil

class CuriosityRewardProvider(BaseRewardProvider):
    beta = 0.2  # Forward vs Inverse loss weight
    loss_multiplier = 10.0  # Loss multiplier

    def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None:
        super().__init__(specs, settings)
        self._ignore_done = True
        self._network = CuriosityNetwork(specs, settings)
        self._network.to(default_device())
        self.optimizer = torch.optim.Adam(
            self._network.parameters(), lr=settings.learning_rate
        )
        self._has_updated_once = False

    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        with torch.no_grad():
            rewards = ModelUtils.to_numpy(self._network.compute_reward(mini_batch))
        rewards = np.minimum(rewards, 1.0 / self.strength)
        return rewards * self._has_updated_once

    def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:
        self._has_updated_once = True
        forward_loss = self._network.compute_forward_loss(mini_batch)
        inverse_loss = self._network.compute_inverse_loss(mini_batch)
        loss = self.loss_multiplier * (
            self.beta * forward_loss + (1.0 - self.beta) * inverse_loss
        )
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return {
            "Losses/Curiosity Forward Loss": forward_loss.item(),
            "Losses/Curiosity Inverse Loss": inverse_loss.item(),
        }
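The update combines the two ICM losses as loss = loss_multiplier * (beta * forward_loss + (1 - beta) * inverse_loss). With the class constants above (beta = 0.2, loss_multiplier = 10.0), a quick numeric check using example loss values:

beta, loss_multiplier = 0.2, 10.0
forward_loss, inverse_loss = 0.5, 1.0  # example values only
loss = loss_multiplier * (beta * forward_loss + (1.0 - beta) * inverse_loss)
print(loss)  # 10.0 * (0.2 * 0.5 + 0.8 * 1.0) ≈ 9.0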
    def get_modules(self):
        return {f"Module:{self.name}": self._network}


class CuriosityNetwork(torch.nn.Module):
    EPSILON = 1e-10

    def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None:
        super().__init__()
        self._action_spec = specs.action_spec
        state_encoder_settings = NetworkSettings(
            normalize=False,
            hidden_units=settings.encoding_size,
            num_layers=2,
            vis_encode_type=EncoderType.SIMPLE,
            memory=None,
        )
        self._state_encoder = NetworkBody(
            specs.observation_shapes, state_encoder_settings
        )
        self._action_flattener = ModelUtils.ActionFlattener(self._action_spec)

        self.inverse_model_action_prediction = torch.nn.Sequential(
            LinearEncoder(2 * settings.encoding_size, 1, 256),
            linear_layer(256, self._action_flattener.flattened_size),
        )

        self.forward_model_next_state_prediction = torch.nn.Sequential(
            LinearEncoder(
                settings.encoding_size + self._action_flattener.flattened_size, 1, 256
            ),
            linear_layer(256, settings.encoding_size),
        )
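A shape sketch of the two heads built above, with plain torch.nn layers standing in for LinearEncoder and linear_layer, and assuming encoding_size = 64 and a flattened action size of 5 (both values are illustrative):

import torch
import torch.nn as nn

encoding_size, action_size = 64, 5
# Inverse model: concatenated (current, next) embeddings -> predicted action / logits.
inverse_model = nn.Sequential(
    nn.Linear(2 * encoding_size, 256), nn.ReLU(), nn.Linear(256, action_size)
)
# Forward model: current embedding + flattened action -> predicted next embedding.
forward_model = nn.Sequential(
    nn.Linear(encoding_size + action_size, 256), nn.ReLU(), nn.Linear(256, encoding_size)
)
print(inverse_model(torch.zeros(1, 2 * encoding_size)).shape)            # torch.Size([1, 5])
print(forward_model(torch.zeros(1, encoding_size + action_size)).shape)  # torch.Size([1, 64])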
    def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Extracts the current state embedding from a mini_batch.
        """
        n_obs = len(self._state_encoder.encoders)
        obs = ObsUtil.from_buffer(mini_batch, n_obs)
        # Convert to tensors
        obs = [ModelUtils.list_to_tensor(obs) for obs in obs]
        hidden, _ = self._state_encoder.forward(obs)
        return hidden

    def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Extracts the next state embedding from a mini_batch.
        """
        n_obs = len(self._state_encoder.encoders)
        obs = ObsUtil.from_buffer_next(mini_batch, n_obs)
        # Convert to tensors
        obs = [ModelUtils.list_to_tensor(obs) for obs in obs]
        hidden, _ = self._state_encoder.forward(obs)
        return hidden

    def predict_action(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        In the continuous case, returns the predicted action.
        In the discrete case, returns the logits.
        """
        inverse_model_input = torch.cat(
            (self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1
        )
        hidden = self.inverse_model_action_prediction(inverse_model_input)
        if self._action_spec.is_continuous():
            return hidden
        else:
            branches = ModelUtils.break_into_branches(
                hidden, self._action_spec.discrete_branches
            )
            branches = [torch.softmax(b, dim=1) for b in branches]
            return torch.cat(branches, dim=1)
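In the discrete branch above, the flat logits are split per action branch and each branch gets its own softmax. A minimal equivalent with plain torch indexing, for two hypothetical branches of size 3 and 2:

import torch

logits = torch.randn(1, 5)
branch_sizes = (3, 2)
branches, start = [], 0
for size in branch_sizes:
    branches.append(torch.softmax(logits[:, start:start + size], dim=1))
    start += size
probs = torch.cat(branches, dim=1)  # shape (1, 5); each branch sums to 1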
    def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Uses the current state embedding and the action of the mini_batch to predict
        the next state embedding.
        """
        if self._action_spec.is_continuous():
            action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
        else:
            action = torch.cat(
                ModelUtils.actions_to_onehot(
                    ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
                    self._action_spec.discrete_branches,
                ),
                dim=1,
            )
        forward_model_input = torch.cat(
            (self.get_current_state(mini_batch), action), dim=1
        )
        return self.forward_model_next_state_prediction(forward_model_input)

    def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Computes the inverse loss for a mini_batch. Corresponds to the error on the
        action prediction (given the current and next state).
        """
        predicted_action = self.predict_action(mini_batch)
        if self._action_spec.is_continuous():
            sq_difference = (
                ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
                - predicted_action
            ) ** 2
            sq_difference = torch.sum(sq_difference, dim=1)
            return torch.mean(
                ModelUtils.dynamic_partition(
                    sq_difference,
                    ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
                    2,
                )[1]
            )
        else:
            true_action = torch.cat(
                ModelUtils.actions_to_onehot(
                    ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
                    self._action_spec.discrete_branches,
                ),
                dim=1,
            )
            cross_entropy = torch.sum(
                -torch.log(predicted_action + self.EPSILON) * true_action, dim=1
            )
            return torch.mean(
                ModelUtils.dynamic_partition(
                    cross_entropy,
                    ModelUtils.list_to_tensor(
                        mini_batch["masks"], dtype=torch.float
                    ),  # use masks not action_masks
                    2,
                )[1]
            )
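The dynamic_partition call splits the per-sample losses by the 0/1 masks tensor, and the [1] index keeps the entries whose mask is 1, so the mean is taken over valid (unmasked) timesteps only. A minimal equivalent with plain torch operations:

import torch

per_sample_loss = torch.tensor([0.3, 0.7, 0.5, 0.9])
masks = torch.tensor([1.0, 0.0, 1.0, 1.0])  # 0.0 marks padded / invalid steps
masked_mean = per_sample_loss[masks.bool()].mean()  # (0.3 + 0.5 + 0.9) / 3 ≈ 0.5667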
    def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Calculates the curiosity reward for the mini_batch. Corresponds to the error
        between the predicted and actual next state.
        """
        predicted_next_state = self.predict_next_state(mini_batch)
        target = self.get_next_state(mini_batch)
        sq_difference = 0.5 * (target - predicted_next_state) ** 2
        sq_difference = torch.sum(sq_difference, dim=1)
        return sq_difference

    def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Computes the loss for the next state prediction
        """
        return torch.mean(
            ModelUtils.dynamic_partition(
                self.compute_reward(mini_batch),
                ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
                2,
            )[1]
        )
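Note that compute_forward_loss is the masked mean of the same half squared error between predicted and actual next-state embeddings that compute_reward pays out, so transitions the forward model predicts poorly are both penalized during the update and rewarded as curiosity. A standalone sketch of that per-sample term:

import torch

def curiosity_term(pred_next_embedding: torch.Tensor, next_embedding: torch.Tensor) -> torch.Tensor:
    # Half squared error between predicted and actual next-state embeddings, per sample.
    return 0.5 * ((next_embedding - pred_next_embedding) ** 2).sum(dim=1)

print(curiosity_term(torch.zeros(2, 4), torch.ones(2, 4)))  # tensor([2., 2.])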