Compare commits

...
This merge request has changes that conflict with the target branch.
/ml-agents/mlagents/trainers/trainer_controller.py
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/sac/trainer.py
/ml-agents/mlagents/trainers/tests/test_trainer_util.py
/ml-agents/mlagents/trainers/tests/test_trainer_controller.py
/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
/ml-agents/mlagents/trainers/trainer.py
/ml-agents/mlagents/trainers/rl_trainer.py
/UnitySDK/ProjectSettings/EditorBuildSettings.asset
/UnitySDK/Assets/ML-Agents/Editor/BehaviorParametersEditor.cs
/UnitySDK/Assets/ML-Agents/Scripts/Policy/BehaviorParameters.cs
/UnitySDK/ProjectSettings/DynamicsManager.asset
/UnitySDK/ProjectSettings/EditorSettings.asset
/UnitySDK/Assets/ML-Agents/Examples/Tennis/Prefabs/TennisArea.prefab
/ml-agents/mlagents/trainers/trainer_util.py
/ml-agents/mlagents/trainers/tests/test_ppo.py
/ml-agents/mlagents/trainers/tests/test_sac.py

16 commits

Showing 17 changed files with 208 additions and 86 deletions
  1. UnitySDK/ProjectSettings/DynamicsManager.asset (2 changes)
  2. UnitySDK/ProjectSettings/EditorBuildSettings.asset (5 changes)
  3. UnitySDK/ProjectSettings/EditorSettings.asset (9 changes)
  4. UnitySDK/Assets/ML-Agents/Examples/Tennis/Prefabs/TennisArea.prefab (6 changes)
  5. UnitySDK/Assets/ML-Agents/Editor/BehaviorParametersEditor.cs (1 change)
  6. UnitySDK/Assets/ML-Agents/Scripts/Policy/BehaviorParameters.cs (11 changes)
  7. ml-agents/mlagents/trainers/trainer.py (22 changes)
  8. ml-agents/mlagents/trainers/trainer_util.py (16 changes)
  9. ml-agents/mlagents/trainers/ppo/trainer.py (49 changes)
  10. ml-agents/mlagents/trainers/rl_trainer.py (14 changes)
  11. ml-agents/mlagents/trainers/sac/trainer.py (40 changes)
  12. ml-agents/mlagents/trainers/tests/test_ppo.py (10 changes)
  13. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (7 changes)
  14. ml-agents/mlagents/trainers/tests/test_sac.py (6 changes)
  15. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (4 changes)
  16. ml-agents/mlagents/trainers/tests/test_trainer_util.py (12 changes)
  17. ml-agents/mlagents/trainers/trainer_controller.py (80 changes)

UnitySDK/ProjectSettings/DynamicsManager.asset (2 changes)


PhysicsManager:
m_ObjectHideFlags: 0
serializedVersion: 7
m_Gravity: {x: 0, y: -9.81, z: 0}
m_Gravity: {x: 0, y: -19.81, z: 0}
m_DefaultMaterial: {fileID: 0}
m_BounceThreshold: 2
m_SleepThreshold: 0.005

UnitySDK/ProjectSettings/EditorBuildSettings.asset (5 changes)


EditorBuildSettings:
m_ObjectHideFlags: 0
serializedVersion: 2
m_Scenes: []
m_Scenes:
- enabled: 1
path: Assets/ML-Agents/Examples/Tennis/Scenes/Tennis.unity
guid: 25c0c9e81e55c4e129e1a5c0ac254100

UnitySDK/ProjectSettings/EditorSettings.asset (9 changes)


--- !u!159 &1
EditorSettings:
m_ObjectHideFlags: 0
serializedVersion: 4
serializedVersion: 7
m_LineEndingsForNewScripts: 1
m_ProjectGenerationIncludedExtensions: txt;xml;fnt;cd
m_EtcTextureCompressorBehavior: 0
m_EtcTextureFastCompressor: 2
m_EtcTextureNormalCompressor: 2
m_EtcTextureBestCompressor: 5
m_ProjectGenerationIncludedExtensions: txt;xml;fnt;cd;asmdef
m_ProjectGenerationRootNamespace:
m_UserGeneratedProjectSuffix:
m_CollabEditorSettings:

UnitySDK/Assets/ML-Agents/Examples/Tennis/Prefabs/TennisArea.prefab (6 changes)


vectorActionSize: 02000000
vectorActionDescriptions: []
vectorActionSpaceType: 1
m_Model: {fileID: 11400000, guid: d6c5e749e4ceb4cf79640a5955706d3d, type: 3}
m_Model: {fileID: 0}
m_TeamID: 0
--- !u!114 &114399072728845634
MonoBehaviour:
m_ObjectHideFlags: 1

vectorActionSize: 02000000
vectorActionDescriptions: []
vectorActionSpaceType: 1
m_Model: {fileID: 11400000, guid: d6c5e749e4ceb4cf79640a5955706d3d, type: 3}
m_Model: {fileID: 0}
m_TeamID: 1
--- !u!114 &114800310164848628
MonoBehaviour:
m_ObjectHideFlags: 1

UnitySDK/Assets/ML-Agents/Editor/BehaviorParametersEditor.cs (1 change)


EditorGUI.indentLevel--;
EditorGUILayout.PropertyField(so.FindProperty("m_BehaviorType"));
// EditorGUILayout.PropertyField(serializedObject.FindProperty("m_Heuristic"), true);
EditorGUILayout.PropertyField(so.FindProperty("m_TeamID"));
EditorGUI.indentLevel--;
if (EditorGUI.EndChangeCheck())
{

UnitySDK/Assets/ML-Agents/Scripts/Policy/BehaviorParameters.cs (11 changes)


using Barracuda;
using System;
using System.Collections.Generic;
using UnityEngine;
namespace MLAgents

[HideInInspector]
[SerializeField]
string m_BehaviorName = "My Behavior";
[HideInInspector] [SerializeField]
int m_TeamID = 0;
public BrainParameters brainParameters
{
get { return m_BrainParameters; }

{
get { return m_BehaviorName; }
get { return m_BehaviorName + "?team=" + m_TeamID;}
}
public IPolicy GeneratePolicy(Func<float[]> heuristic)

case BehaviorType.Default:
if (FindObjectOfType<Academy>().IsCommunicatorOn)
{
return new RemotePolicy(m_BrainParameters, m_BehaviorName);
return new RemotePolicy(m_BrainParameters, behaviorName);
}
if (m_Model != null)
{
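
For context: the change above builds the fully qualified behavior name as "<behavior name>?team=<team id>" on the C# side, and trainer_controller.py further down splits it back apart. A minimal Python sketch of that round trip; the helper names are illustrative, not part of the diff:

    def fully_qualified_behavior_name(behavior_name: str, team_id: int) -> str:
        # Mirrors BehaviorParameters: "My Behavior" with team 1 -> "My Behavior?team=1"
        return behavior_name + "?team=" + str(team_id)

    def brain_name_from_behavior_id(name_behavior_id: str) -> str:
        # Mirrors trainer_controller.py: drop the "?team=..." suffix when present.
        try:
            brain_name, _ = name_behavior_id.split("?")
        except ValueError:
            brain_name = name_behavior_id
        return brain_name

    assert brain_name_from_behavior_id(fully_qualified_behavior_name("Tennis", 1)) == "Tennis"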

ml-agents/mlagents/trainers/trainer.py (22 changes)


def __init__(
self,
brain: BrainParameters,
brain_name: str,
trainer_parameters: dict,
training: bool,
run_id: str,

:int reward_buff_cap:
"""
self.param_keys: List[str] = []
self.brain_name = brain.brain_name
self.brain_name = brain_name
self.run_id = run_id
self.trainer_parameters = trainer_parameters
self.summary_path = trainer_parameters["summary_path"]

self.summary_writer = tf.summary.FileWriter(self.summary_path)
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy: TFPolicy = None
self.policies: Dict[str, TFPolicy] = {}
self.step: int = 0
def check_param_keys(self):

def add_experiences(
self,
name_behavior_id: str,
curr_info: BrainInfo,
next_info: BrainInfo,
take_action_outputs: ActionInfoOutputs,

:param name_behavior_id: string policy identifier.
:param curr_info: current BrainInfo.
:param next_info: next BrainInfo.
:param take_action_outputs: The outputs of the Policy's get_action method.

def process_experiences(
self, current_info: BrainInfo, next_info: BrainInfo
self, name_behavior_id: str, current_info: BrainInfo, next_info: BrainInfo
:param name_behavior_id: string policy identifier.
:param current_info: current BrainInfo.
:param next_info: next BrainInfo.
"""

Uses demonstration_buffer to update model.
"""
raise UnityTrainerException("The update_model method was not implemented.")
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
"""
Creates policy
"""
raise UnityTrainerException("The update_model method was not implemented.")
def get_policy(self, brain_name: str) -> TFPolicy:
"""
Gets policy from trainers list of policies
"""
return self.policies[brain_name]
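
The net effect of this hunk: a trainer is constructed from a brain name string instead of a BrainParameters object, and it keeps a dictionary of policies keyed by brain name. A simplified sketch of the resulting surface (a stand-in class, not the real Trainer):

    from typing import Any, Dict

    class TrainerSurface:
        def __init__(self, brain_name: str, trainer_parameters: dict) -> None:
            self.brain_name = brain_name
            self.trainer_parameters = trainer_parameters
            self.policies: Dict[str, Any] = {}  # brain name -> policy

        def create_policy(self, brain_parameters) -> Any:
            # Implemented by the PPO/SAC subclasses in the hunks below.
            raise NotImplementedError

        def add_policy(self, brain_parameters) -> None:
            # From rl_trainer.py below: build a policy and register it under its brain name.
            self.policies[brain_parameters.brain_name] = self.create_policy(brain_parameters)

        def get_policy(self, brain_name: str) -> Any:
            return self.policies[brain_name]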

ml-agents/mlagents/trainers/trainer_util.py (16 changes)


from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.envs.exception import UnityEnvironmentException
from mlagents.trainers.trainer import Trainer
from mlagents.envs.brain import BrainParameters
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer

self.meta_curriculum = meta_curriculum
self.multi_gpu = multi_gpu
def generate(self, brain_parameters: BrainParameters) -> Trainer:
def generate(self, brain_name: str) -> Trainer:
brain_parameters,
brain_name,
self.summaries_dir,
self.run_id,
self.model_path,

def initialize_trainer(
trainer_config: Any,
brain_parameters: BrainParameters,
brain_name: str,
summaries_dir: str,
run_id: str,
model_path: str,

some general training session options.
:param trainer_config: Original trainer configuration loaded from YAML
:param brain_parameters: BrainParameters provided by the Unity environment
:param brain_name: Name of the brain to be associated with trainer
:param summaries_dir: Directory to store trainer summary statistics
:param run_id: Run ID to associate with this training run
:param model_path: Path to save the model

:return:
"""
trainer_parameters = trainer_config["default"].copy()
brain_name = brain_parameters.brain_name
trainer_parameters["summary_path"] = "{basedir}/{name}".format(
basedir=summaries_dir, name=str(run_id) + "_" + brain_name
)

trainer = None
if trainer_parameters["trainer"] == "offline_bc":
trainer = OfflineBCTrainer(
brain_parameters, trainer_parameters, train_model, load_model, seed, run_id
brain_name, trainer_parameters, train_model, load_model, seed, run_id
brain_parameters,
brain_name,
meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
if meta_curriculum
else 1,

)
elif trainer_parameters["trainer"] == "sac":
trainer = SACTrainer(
brain_parameters,
brain_name,
meta_curriculum.brains_to_curriculums[brain_name].min_lesson_length
if meta_curriculum
else 1,
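
As a concrete example of the summary_path construction above, with summaries_dir "./summaries", run_id "run1", and brain name "Tennis" (illustrative values only):

    summaries_dir, run_id, brain_name = "./summaries", "run1", "Tennis"  # example values
    summary_path = "{basedir}/{name}".format(
        basedir=summaries_dir, name=str(run_id) + "_" + brain_name
    )
    # summary_path == "./summaries/run1_Tennis"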

ml-agents/mlagents/trainers/ppo/trainer.py (49 changes)


import numpy as np
from mlagents.envs.brain import BrainInfo
from mlagents.envs.brain import BrainParameters, BrainInfo
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput

def __init__(
self,
brain,
brain_name,
reward_buff_cap,
trainer_parameters,
training,

:param run_id: The identifier of the current run
"""
super(PPOTrainer, self).__init__(
brain, trainer_parameters, training, run_id, reward_buff_cap
brain_name, trainer_parameters, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",

"reward_signals",
]
self.check_param_keys()
if multi_gpu and len(get_devices()) > 1:
self.policy = MultiGpuPPOPolicy(
seed, brain, trainer_parameters, self.is_training, load
)
else:
self.policy = PPOPolicy(
seed, brain, trainer_parameters, self.is_training, load
)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
self.load = load
self.multi_gpu = multi_gpu
self.seed = seed
self.policy = None
self, current_info: BrainInfo, next_info: BrainInfo
self, name_behavior_id: str, current_info: BrainInfo, next_info: BrainInfo
:param name_behavior_id: string policy identifier.
:param current_info: current BrainInfo.
:param next_info: next BrainInfo.
"""

self.stats[stat].append(val)
self.clear_update_buffer()
self.trainer_metrics.end_policy_update()
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
if self.multi_gpu and len(get_devices()) > 1:
policy = MultiGpuPPOPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
else:
policy = PPOPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
for _reward_signal in policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
return policy
def discount_rewards(r, gamma=0.99, value_next=0.0):
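
discount_rewards, whose signature closes the hunk, computes discounted returns. A standard implementation consistent with that signature (a sketch, not necessarily this file's exact body):

    import numpy as np

    def discount_rewards(r, gamma=0.99, value_next=0.0):
        # Walk backwards through the reward array, bootstrapping from value_next.
        discounted_r = np.zeros_like(r)
        running_add = value_next
        for t in reversed(range(0, r.size)):
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

    # e.g. discount_rewards(np.array([0.0, 0.0, 1.0]), gamma=0.5) -> [0.25, 0.5, 1.0]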

ml-agents/mlagents/trainers/rl_trainer.py (14 changes)


from typing import Dict, List, Any, NamedTuple
import numpy as np
from mlagents.envs.brain import BrainInfo
from mlagents.envs.brain import BrainParameters, BrainInfo
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.components.reward_signals import RewardSignalResult
LOGGER = logging.getLogger("mlagents.trainers")

self.processing_buffer = ProcessingBuffer()
self.update_buffer = AgentBuffer()
self.episode_steps = {}
self.policy: TFPolicy = None
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
"""

def add_experiences(
self,
name_behavior_id: str,
curr_info: BrainInfo,
next_info: BrainInfo,
take_action_outputs: ActionInfoOutputs,

:param name_behavior_id: string policy identifier.
:param curr_info: current BrainInfo.
:param next_info: next BrainInfo.
:param take_action_outputs: The outputs of the Policy's get_action method.

raise UnityTrainerException(
"The add_rewards_outputs method was not implemented."
)
def add_policy(self, brain_parameters: BrainParameters) -> None:
"""
Adds policy to trainers list of policies
"""
policy = self.create_policy(brain_parameters)
self.policy = policy
self.policies[brain_parameters.brain_name] = policy

ml-agents/mlagents/trainers/sac/trainer.py (40 changes)


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (SAC)
# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290
# and implemented in https://github.com/hill-a/stable-baselines

import numpy as np
from mlagents.envs.brain import BrainInfo
from mlagents.envs.brain import BrainParameters, BrainInfo
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.sac.policy import SACPolicy
from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput

"""
def __init__(
self, brain, reward_buff_cap, trainer_parameters, training, load, seed, run_id
self,
brain_name,
reward_buff_cap,
trainer_parameters,
training,
load,
seed,
run_id,
):
"""
Responsible for collecting experiences and training SAC model.

:param seed: The seed the model will be initialized with
:param run_id: The identifier of the current run
"""
super().__init__(brain, trainer_parameters, training, run_id, reward_buff_cap)
super().__init__(
brain_name, trainer_parameters, training, run_id, reward_buff_cap
)
self.param_keys = [
"batch_size",
"buffer_size",

]
self.check_param_keys()
self.load = load
self.seed = seed
self.policy = None
self.step = 0
self.train_interval = (

if "save_replay_buffer" in trainer_parameters
else False
)
self.policy = SACPolicy(seed, brain, trainer_parameters, self.is_training, load)
# Load the replay buffer if load
if load and self.checkpoint_replay_buffer:

self.update_buffer.num_experiences
)
)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
self.episode_steps = {}

)
def process_experiences(
self, current_info: BrainInfo, next_info: BrainInfo
self, name_behavior_id: str, current_info: BrainInfo, next_info: BrainInfo
:param name_behavior_id: string policy identifier.
:param current_info: current BrainInfo.
:param next_info: next BrainInfo.
"""

self.update_sac_policy()
self.update_reward_signals()
self.trainer_metrics.end_policy_update()
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
policy = SACPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
)
for _reward_signal in policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
return policy
def update_sac_policy(self) -> None:
"""

ml-agents/mlagents/trainers/tests/test_ppo.py (10 changes)


trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128
trainer = PPOTrainer(mock_brain, 0, trainer_params, True, False, 0, "0", False)
trainer = PPOTrainer(
mock_brain.brain_name, 0, trainer_params, True, False, 0, "0", False
)
trainer.add_policy(mock_brain)
# Test update with sequence length smaller than batch size
buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
# Mock out reward signal eval

)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
trainer = PPOTrainer(
brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
)
trainer.add_policy(brain_params)
rewardsout = AllRewardsOutput(
reward_signals={
"extrinsic": RewardSignalResult(

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (7 changes)


def create_rl_trainer():
mock_brainparams = create_mock_brain()
trainer = RLTrainer(mock_brainparams, dummy_config(), True, 0)
trainer = RLTrainer(mock_brainparams.brain_name, dummy_config(), True, 0)
return trainer

def test_rl_trainer(add_policy_outputs, add_rewards_outputs, num_vis_obs):
trainer = create_rl_trainer()
trainer.policy = create_mock_policy()
fake_id = "fake_behavior_id"
fake_action_outputs = {
"action": [0.1, 0.1],
"value_heads": {},

num_vector_acts=2,
num_vis_observations=num_vis_obs,
)
trainer.add_experiences(mock_braininfo, mock_braininfo, fake_action_outputs)
trainer.add_experiences(
fake_id, mock_braininfo, mock_braininfo, fake_action_outputs
)
# Remove one of the agents
next_mock_braininfo = mb.create_mock_braininfo(

ml-agents/mlagents/trainers/tests/test_sac.py (6 changes)


trainer_params["summary_path"] = str(tmpdir)
trainer_params["model_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer = SACTrainer(mock_brain, 1, trainer_params, True, False, 0, 0)
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
trainer.add_policy(mock_brain)
trainer.update_buffer = mb.simulate_rollout(
env, trainer.policy, BUFFER_INIT_SAMPLES
)

# Wipe Trainer and try to load
trainer2 = SACTrainer(mock_brain, 1, trainer_params, True, True, 0, 0)
trainer2 = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, True, 0, 0)
trainer2.add_policy(mock_brain)
assert trainer2.update_buffer.num_experiences == buffer_len

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (4 changes)


env_mock.reset.assert_not_called()
env_mock.step.assert_called_once()
trainer_mock.add_experiences.assert_called_once_with(
brain_name,
brain_name,
new_step_info.previous_all_brain_info[brain_name],
new_step_info.current_all_brain_info[brain_name],
)

env_mock.reset.assert_not_called()
env_mock.step.assert_called_once()
trainer_mock.add_experiences.assert_called_once_with(
brain_name,
brain_name,
new_step_info.previous_all_brain_info[brain_name],
new_step_info.current_all_brain_info[brain_name],
)

ml-agents/mlagents/trainers/tests/test_trainer_util.py (12 changes)


external_brains = {"testbrain": brain_params_mock}
def mock_constructor(self, brain, trainer_parameters, training, load, seed, run_id):
assert brain == brain_params_mock
assert brain == brain_params_mock.brain_name
assert trainer_parameters == expected_config
assert training == train_model
assert load == load_model

)
trainers = {}
for _, brain_parameters in external_brains.items():
trainers["testbrain"] = trainer_factory.generate(brain_parameters)
trainers["testbrain"] = trainer_factory.generate(
brain_parameters.brain_name
)
assert "testbrain" in trainers
assert isinstance(trainers["testbrain"], OfflineBCTrainer)

multi_gpu,
):
self.trainer_metrics = TrainerMetrics("", "")
assert brain == brain_params_mock
assert brain == brain_params_mock.brain_name
assert trainer_parameters == expected_config
assert reward_buff_cap == expected_reward_buff_cap
assert training == train_model

)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters)
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
assert "testbrain" in trainers
assert isinstance(trainers["testbrain"], PPOTrainer)

)
trainers = {}
for brain_name, brain_parameters in external_brains.items():
trainers[brain_name] = trainer_factory.generate(brain_parameters)
trainers[brain_name] = trainer_factory.generate(brain_parameters.brain_name)
def test_load_config_missing_file():

ml-agents/mlagents/trainers/trainer_controller.py (80 changes)


import json
import logging
from typing import Dict, List, Optional, Set
from collections import defaultdict
import numpy as np
from mlagents.tf_utils import tf

:param resampling_interval: Specifies number of simulation steps after which reset parameters are resampled.
"""
self.trainers: Dict[str, Trainer] = {}
self.multi_trainers: Dict[str, Trainer] = {}
self.brain_name_to_identifier: Dict[str, Set] = defaultdict(set)
self.trainer_factory = trainer_factory
self.model_path = model_path
self.summaries_dir = summaries_dir

else:
trainer.write_summary(global_step, delta_train_start)
def start_trainer(self, trainer: Trainer, env_manager: EnvManager) -> None:
self.trainers[trainer.brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text("Hyperparameters", trainer.parameters)
env_manager.set_policy(trainer.brain_name, trainer.policy)
last_brain_names: Set[str] = set()
last_brain_behavior_ids: Set[str] = set()
external_brains = set(env_manager.external_brains.keys())
new_brains = external_brains - last_brain_names
if last_brain_names != env_manager.external_brains.keys():
for name in new_brains:
trainer = self.trainer_factory.generate(
env_manager.external_brains[name]
)
self.start_trainer(trainer, env_manager)
last_brain_names = external_brains
external_brain_behavior_ids = set(env_manager.external_brains.keys())
new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
for name_behavior_id in new_behavior_ids:
try:
brain_name, _ = name_behavior_id.split("?")
except ValueError:
brain_name = name_behavior_id
try:
trainer = self.trainers[brain_name]
except KeyError:
trainer = self.trainer_factory.generate(brain_name)
self.trainers[brain_name] = trainer
self.logger.info(trainer)
if self.train_model:
trainer.write_tensorboard_text(
"Hyperparameters", trainer.parameters
)
trainer.add_policy(env_manager.external_brains[name_behavior_id])
env_manager.set_policy(
name_behavior_id, trainer.get_policy(name_behavior_id)
)
self.brain_name_to_identifier[brain_name].add(name_behavior_id)
last_brain_behavior_ids = external_brain_behavior_ids
n_steps = self.advance(env_manager)
for i in range(n_steps):
global_step += 1

for brain_name, trainer in self.trainers.items():
if brain_name in self.trainer_metrics:
self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
if step_info.has_actions_for_brain(brain_name):
trainer.add_experiences(
step_info.previous_all_brain_info[brain_name],
step_info.current_all_brain_info[brain_name],
step_info.brain_name_to_action_info[brain_name].outputs,
)
trainer.process_experiences(
step_info.previous_all_brain_info[brain_name],
step_info.current_all_brain_info[brain_name],
)
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
if step_info.has_actions_for_brain(name_behavior_id):
trainer.add_experiences(
name_behavior_id,
step_info.previous_all_brain_info[name_behavior_id],
step_info.current_all_brain_info[name_behavior_id],
step_info.brain_name_to_action_info[
name_behavior_id
].outputs,
)
trainer.process_experiences(
name_behavior_id,
step_info.previous_all_brain_info[name_behavior_id],
step_info.current_all_brain_info[name_behavior_id],
)
for brain_name, trainer in self.trainers.items():
if brain_name in self.trainer_metrics:
self.trainer_metrics[brain_name].add_delta_step(delta_time_step)

# Perform gradient descent with experience buffer
with hierarchical_timer("update_policy"):
trainer.update_policy()
env.set_policy(brain_name, trainer.policy)
for name_behavior_id in self.brain_name_to_identifier[brain_name]:
env.set_policy(
name_behavior_id, trainer.get_policy(name_behavior_id)
)
else:
# Avoid memory leak during inference
trainer.clear_update_buffer()
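
Putting the controller changes together: each behavior id coming from the environment is mapped back to a brain name, a trainer is created lazily per brain name, and every behavior id gets its own policy registered with the environment manager. A hedged sketch of that loop (the helper and its parameters are illustrative stand-ins for the objects used above):

    from collections import defaultdict

    def register_behaviors(trainer_factory, env_manager, trainers, brain_name_to_identifier=None):
        # trainers: dict brain name -> trainer; external_brains maps behavior id -> BrainParameters.
        if brain_name_to_identifier is None:
            brain_name_to_identifier = defaultdict(set)
        for name_behavior_id, brain_params in env_manager.external_brains.items():
            brain_name = name_behavior_id.split("?")[0]  # "Tennis?team=0" -> "Tennis"
            if brain_name not in trainers:
                trainers[brain_name] = trainer_factory.generate(brain_name)
            trainer = trainers[brain_name]
            trainer.add_policy(brain_params)
            env_manager.set_policy(name_behavior_id, trainer.get_policy(name_behavior_id))
            brain_name_to_identifier[brain_name].add(name_behavior_id)
        return brain_name_to_identifier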