from typing import Dict, List, Optional, Tuple
from contextlib import ExitStack

import torch
from torch import nn

from mlagents_envs.timers import timed
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.settings import TrainerSettings, SACSettings

EPSILON = 1e-6  # Small value to avoid divide by zero
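# Illustrative note (not taken from this file): EPSILON is typically added inside
# logarithms or denominators, e.g. torch.log(action_probs + EPSILON), so that zero
# probabilities do not produce -inf values or NaN gradients.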


class TorchSACOptimizer(TorchOptimizer):
    class PolicyValueNetwork(nn.Module):
        """
        Wrapper around the Q1 and Q2 networks (self.q1_network and self.q2_network)
        that together make up the SAC value network.
        """

        def forward(
            self,
            vec_inputs: List[torch.Tensor],
            vis_inputs: List[torch.Tensor],
            actions: Optional[torch.Tensor] = None,
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
            q1_grad: bool = True,
            q2_grad: bool = True,
        ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
""" |
|
|
|
Performs a forward pass on the value network, which consists of a Q1 and Q2 |
|
|
|
network. Optionally does not evaluate gradients for either the Q1, Q2, or both. |
|
|
|
:param vec_inputs: List of vector observation tensors. |
|
|
|
:param vis_input: List of visual observation tensors. |
|
|
|
:param actions: For a continuous Q function (has actions), tensor of actions. |
|
|
|
Otherwise, None. |
|
|
|
:param memories: Initial memories if using memory. Otherwise, None. |
|
|
|
:param sequence_length: Sequence length if using memory. |
|
|
|
:param q1_grad: Whether or not to compute gradients for the Q1 network. |
|
|
|
:param q2_grad: Whether or not to compute gradients for the Q2 network. |
|
|
|
:return: Tuple of two dictionaries, which both map {reward_signal: Q} for Q1 and Q2, |
|
|
|
respectively. |
|
|
|
""" |
|
|
|
            # ExitStack allows us to enter the torch.no_grad() context conditionally
            with ExitStack() as stack:
                if not q1_grad:
                    stack.enter_context(torch.no_grad())
                q1_out, _ = self.q1_network(
                    vec_inputs,
                    vis_inputs,
                    actions=actions,
                    memories=memories,
                    sequence_length=sequence_length,
                )
            with ExitStack() as stack:
                if not q2_grad:
                    stack.enter_context(torch.no_grad())
                q2_out, _ = self.q2_network(
                    vec_inputs,
                    vis_inputs,
                    actions=actions,
                    memories=memories,
                    sequence_length=sequence_length,
                )
            return q1_out, q2_out
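
        # Usage sketch (illustrative only; `value_net`, `vec_obs`, and `cont_actions`
        # are hypothetical names, and "extrinsic" is just an example reward-signal key):
        #
        #     q1_heads, q2_heads = value_net(
        #         [vec_obs], [], actions=cont_actions, q2_grad=False
        #     )
        #     q1_for_policy = q1_heads["extrinsic"]  # gradients flow only through Q1
        #
        # Q2 is still evaluated, but inside torch.no_grad(), so no graph is built for it.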

    def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
        super().__init__(policy, trainer_params)
        # ... (rest of __init__ omitted: builds self.value_network and reads the SAC
        # hyperparameters from trainer_params)

    @timed
    def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        # ... (batch unpacking omitted: defines vec_obs, vis_obs, actions, q_memories,
        # and sampled_actions drawn from the current policy)
        if self.policy.use_continuous_act:
            squeezed_actions = actions.squeeze(-1)
            # Only need grad for q1, as that is used for policy.
            q1p_out, q2p_out = self.value_network(
                vec_obs,
                vis_obs,
                sampled_actions,
                memories=q_memories,
                sequence_length=self.policy.sequence_length,
                q2_grad=False,
            )
            q1_out, q2_out = self.value_network(
                vec_obs,
                vis_obs,
                squeezed_actions,
                memories=q_memories,
                sequence_length=self.policy.sequence_length,
            )
            q1_stream, q2_stream = q1_out, q2_out
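            # Clarifying note: q1p_out/q2p_out evaluate Q at the policy's freshly
            # sampled actions (only Q1 needs a gradient here, since it drives the
            # policy update), while q1_out/q2_out evaluate Q at the buffer actions
            # (squeezed_actions) and become q1_stream/q2_stream for the Q losses.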
        else:
            # For discrete, you don't need to backprop through the Q for the policy
            q1p_out, q2p_out = self.value_network(
                vec_obs,
                vis_obs,
                memories=q_memories,
                sequence_length=self.policy.sequence_length,
                q1_grad=False,
                q2_grad=False,
            )
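            # Passing q1_grad=False and q2_grad=False makes PolicyValueNetwork.forward
            # evaluate both sub-networks inside torch.no_grad() (via its ExitStack
            # logic), so no graph is built for either Q estimate here.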
            q1_out, q2_out = self.value_network(
                vec_obs,
                vis_obs,
|