
Combined model and policy for PPO

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit 9ad99eb6
13 files changed, with 389 insertions and 109 deletions
1. ml-agents/mlagents/trainers/components/bc/module.py (2 changed lines)
2. ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py (2 changed lines)
3. ml-agents/mlagents/trainers/exception.py (8 changed lines)
4. ml-agents/mlagents/trainers/learn.py (3 changed lines)
5. ml-agents/mlagents/trainers/models.py (22 changed lines)
6. ml-agents/mlagents/trainers/optimizer.py (8 changed lines)
7. ml-agents/mlagents/trainers/ppo/optimizer.py (33 changed lines)
8. ml-agents/mlagents/trainers/ppo/policy.py (279 changed lines)
9. ml-agents/mlagents/trainers/rl_trainer.py (6 changed lines)
10. ml-agents/mlagents/trainers/tests/test_ppo.py (2 changed lines)
11. ml-agents/mlagents/trainers/tf_policy.py (120 changed lines)
12. ml-agents/mlagents/trainers/trainer.py (10 changed lines)
13. ml-agents/mlagents/trainers/trainer_util.py (3 changed lines)
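Taken together, the hunks below fold the network-building code that previously lived on a separate model object (self.model, a PPOModel/LearningModel instance) into the policy itself: TFPolicy now owns the shared graph placeholders and tensors, PPOPolicy builds the actor network directly (create_cc_actor / create_dc_actor), and PPOOptimizer is constructed with the policy and reads tensors via self.policy instead of a model argument. A minimal sketch of the resulting relationships, with simplified constructors and stub bodies rather than the real implementation:

```python
# Minimal sketch of the post-commit structure; method bodies are stubs and the
# constructor arguments are simplified relative to the real classes.
class TFPolicy:
    def __init__(self, brain, trainer_parameters):
        self.use_recurrent = trainer_parameters["use_recurrent"]
        self._initialize_tensorflow_references()  # batch_size_ph, vector_in, visual_in, ...

    def _initialize_tensorflow_references(self):
        pass  # placeholders and the observation normalizer are created here in tf_policy.py


class PPOPolicy(TFPolicy):
    def __init__(self, brain, trainer_params, reward_signal_configs):
        super().__init__(brain, trainer_params)
        # The actor graph is built directly on the policy (create_cc_actor /
        # create_dc_actor), then the optimizer is handed the policy itself.
        self.optimizer = PPOOptimizer(policy=self, reward_signal_configs=reward_signal_configs)


class PPOOptimizer:
    def __init__(self, policy, reward_signal_configs):
        # Feed dicts are now built from policy.* tensors instead of model.* ones.
        self.policy = policy
```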

ml-agents/mlagents/trainers/components/bc/module.py (2 changed lines)


from mlagents.trainers.tf_policy import TFPolicy
from .model import BCModel
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.exception import UnityTrainerException
class BCModule:

ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py (2 changed lines)


import logging
from typing import Any, Dict, Type
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignal
from mlagents.trainers.components.reward_signals.extrinsic.signal import (
ExtrinsicRewardSignal,

ml-agents/mlagents/trainers/exception.py (8 changed lines)


"""
pass
class UnityTrainerException(TrainerError):
"""
Related to errors with the Trainer.
"""
pass
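UnityTrainerException moves out of trainer.py into mlagents/trainers/exception.py and now derives from TrainerError instead of UnityException directly; every module that previously imported it from mlagents.trainers.trainer (bc/module.py, reward_signal_factory.py, models.py, rl_trainer.py, test_ppo.py, trainer_util.py) switches to the new import path. A small sketch of the resulting hierarchy, assuming TrainerError itself subclasses mlagents_envs' UnityException as the old trainer.py definition did; docstrings abridged:

```python
from mlagents_envs.exception import UnityException


class TrainerError(UnityException):
    # Base class for trainer-side errors (assumed; only its name appears in this hunk).
    pass


class UnityTrainerException(TrainerError):
    """
    Related to errors with the Trainer.
    """

    pass
```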

ml-agents/mlagents/trainers/learn.py (3 changed lines)


from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager
from mlagents_envs.side_channel.side_channel import SideChannel
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig
import multiprocessing
multiprocessing.set_start_method("spawn", True)
def _create_parser():
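learn.py now pins the multiprocessing start method to "spawn" at import time. The second positional argument of set_start_method is force, so the call above is equivalent to the keyword form below and will not raise a RuntimeError if a start method has already been chosen:

```python
import multiprocessing

# Keyword equivalent of the call added in learn.py; force=True overrides any
# previously chosen start method instead of raising RuntimeError.
multiprocessing.set_start_method("spawn", force=True)
```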

ml-agents/mlagents/trainers/models.py (22 changed lines)


import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.brain import CameraResolution
logger = logging.getLogger("mlagents.trainers")

self.normalization_steps = normalization_tensors[1]
self.running_mean = normalization_tensors[2]
self.running_variance = normalization_tensors[3]
self.processed_vector_in = self.normalize_vector_obs(self.vector_in)
self.processed_vector_in = LearningModel.normalize_vector_obs(
self.vector_in,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_in = self.vector_in
self.update_normalization = None

)
return vector_in
def normalize_vector_obs(self, vector_obs):
@staticmethod
def normalize_vector_obs(
vector_obs: tf.Tensor,
running_mean: tf.Tensor,
running_variance: tf.Tensor,
normalization_steps: tf.Tensor,
) -> tf.Tensor:
(vector_obs - self.running_mean)
(vector_obs - running_mean)
self.running_variance
/ (tf.cast(self.normalization_steps, tf.float32) + 1)
running_variance / (tf.cast(normalization_steps, tf.float32) + 1)
),
-5,
5,
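normalize_vector_obs becomes a @staticmethod that takes the running-mean, running-variance, and step-count tensors explicitly, so the same helper can be reused from TFPolicy (see the tf_policy.py hunk below) without a LearningModel instance. Reassembling the fragments above into one piece, with the tf.sqrt call and the clip structure filled in from the standard running-variance normalization (a sketch, not a verbatim copy of the file):

```python
from mlagents.tf_utils import tf


class LearningModel:
    @staticmethod
    def normalize_vector_obs(
        vector_obs: tf.Tensor,
        running_mean: tf.Tensor,
        running_variance: tf.Tensor,
        normalization_steps: tf.Tensor,
    ) -> tf.Tensor:
        # Standardize by the running statistics, then clip to [-5, 5].
        normalized_obs = tf.clip_by_value(
            (vector_obs - running_mean)
            / tf.sqrt(
                running_variance / (tf.cast(normalization_steps, tf.float32) + 1)
            ),
            -5,
            5,
        )
        return normalized_obs
```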

ml-agents/mlagents/trainers/optimizer.py (8 changed lines)


def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size: batch.num_experiences,
self.policy.sequence_length: 1, # We want to feed data in batch-wise, not time-wise.
self.policy.batch_size_ph: batch.num_experiences,
self.policy.sequence_length_ph: 1, # We want to feed data in batch-wise, not time-wise.
}
if self.policy.vec_obs_size > 0:

"""
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size: 1,
self.policy.sequence_length: 1,
self.policy.batch_size_ph: 1,
self.policy.sequence_length_ph: 1,
}
vec_vis_obs = SplitObservations.from_observations(next_obs)
for i in range(len(vec_vis_obs.visual_observations)):
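The only change to the generic optimizer is the rename of the placeholder attributes it feeds: batch_size and sequence_length become batch_size_ph and sequence_length_ph, marking them as TF placeholders that now live on the policy. They are created roughly as below, mirroring the tf_policy.py hunk later in this diff; the wrapper class name here is illustrative:

```python
from mlagents.tf_utils import tf


class PolicyInputs:
    # Illustrative container; in the real code these are attributes of TFPolicy.
    def __init__(self):
        self.batch_size_ph = tf.placeholder(
            shape=None, dtype=tf.int32, name="batch_size"
        )
        self.sequence_length_ph = tf.placeholder(
            shape=None, dtype=tf.int32, name="sequence_length"
        )
```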

ml-agents/mlagents/trainers/ppo/optimizer.py (33 changed lines)


from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule
from mlagents.trainers.optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.ppo.models import PPOModel
logger = logging.getLogger("mlagents.trainers")

:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
feed_dict = self.construct_feed_dict(self.policy, batch, num_sequences)
feed_dict = self.construct_feed_dict(batch, num_sequences)
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.

return update_stats
def construct_feed_dict(
self, model: PPOModel, mini_batch: AgentBuffer, num_sequences: int
self, mini_batch: AgentBuffer, num_sequences: int
model.batch_size: num_sequences,
model.sequence_length: len(mini_batch["advantages"])
self.policy.batch_size_ph: num_sequences,
self.policy.sequence_length_ph: len(mini_batch["advantages"])
model.mask_input: mini_batch["masks"],
self.policy.mask_input: mini_batch["masks"],
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
}

]
if "actions_pre" in mini_batch:
feed_dict[model.output_pre] = mini_batch["actions_pre"]
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
feed_dict[model.action_holder] = mini_batch["actions"]
if model.use_recurrent:
feed_dict[model.prev_action] = mini_batch["prev_action"]
feed_dict[model.action_masks] = mini_batch["action_mask"]
feed_dict[self.policy.action_holder] = mini_batch["actions"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
feed_dict[model.vector_in] = mini_batch["vector_obs"]
if model.vis_obs_size > 0:
for i, _ in enumerate(model.visual_in):
feed_dict[model.visual_in[i]] = mini_batch["visual_obs%d" % i]
if model.use_recurrent:
feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
if self.policy.vis_obs_size > 0:
for i, _ in enumerate(self.policy.visual_in):
feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
if self.policy.use_recurrent:
mem_in = [
mini_batch["memory"][i]
for i in range(

feed_dict[model.memory_in] = mem_in
feed_dict[self.policy.memory_in] = mem_in
return feed_dict
def _execute_model(self, feed_dict, out_dict):
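construct_feed_dict drops its model parameter: every input tensor it needs is now an attribute of the policy the optimizer already holds, so each model.* key becomes self.policy.*, while the PPO-specific tensors (advantage, old log-probs) stay on the optimizer. An abridged sketch of the new shape, assembled only from the added lines visible above; keys whose right-hand sides are truncated in the preview (e.g. sequence_length_ph), the action-related conditionals, and the recurrent memory slicing are left out:

```python
from typing import Any, Dict

from mlagents.tf_utils import tf
from mlagents.trainers.buffer import AgentBuffer


def construct_feed_dict(
    self, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
    # Abridged sketch: graph inputs come from self.policy, loss inputs from the optimizer.
    feed_dict: Dict[tf.Tensor, Any] = {
        self.policy.batch_size_ph: num_sequences,
        self.policy.mask_input: mini_batch["masks"],
        self.advantage: mini_batch["advantages"],
        self.all_old_log_probs: mini_batch["action_probs"],
    }
    feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
    if self.policy.vis_obs_size > 0:
        for i, _ in enumerate(self.policy.visual_in):
            feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
    return feed_dict
```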

ml-agents/mlagents/trainers/ppo/policy.py (279 changed lines)


from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType, LearningRateSchedule
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.models import LearningModel
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule

"Losses/Policy Loss": "policy_loss",
}
self.create_model(
brain, trainer_params, reward_signal_configs, is_training, load, seed
)
self.optimizer: Optional[tf.train.AdamOptimizer] = None
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = trainer_params["num_layers"]
h_size = trainer_params["hidden_units"]
if num_layers < 1:
num_layers = 1
vis_encode_type = EncoderType(trainer_params.get("vis_encode_type", "simple"))
if brain.vector_action_space_type == "continuous":
self.create_cc_actor(h_size, num_layers, vis_encode_type)
else:
self.create_dc_actor(h_size, num_layers, vis_encode_type)
self.bc_module: Optional[BCModule] = None
# Create pretrainer if needed
if "behavioral_cloning" in trainer_params:

**trainer_params["behavioral_cloning"],
)
self.create_optimizer(
brain, trainer_params, reward_signal_configs, is_training, load, seed
)
def create_model(
def create_optimizer(
self, brain, trainer_params, reward_signal_configs, is_training, load, seed
):
"""

:param seed: Random seed.
"""
with self.graph.as_default():
self.model = PPOModel(
brain=brain,
lr=float(trainer_params["learning_rate"]),
lr_schedule=LearningRateSchedule(
trainer_params.get("learning_rate_schedule", "linear")
),
h_size=int(trainer_params["hidden_units"]),
epsilon=float(trainer_params["epsilon"]),
beta=float(trainer_params["beta"]),
max_step=float(trainer_params["max_steps"]),
normalize=trainer_params["normalize"],
use_recurrent=trainer_params["use_recurrent"],
num_layers=int(trainer_params["num_layers"]),
m_size=self.m_size,
seed=seed,
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
policy=self.model,
policy=self,
sess=self.sess,
reward_signal_configs=reward_signal_configs,
lr=float(trainer_params["learning_rate"]),

self.inference_dict.update(
{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"entropy": self.model.entropy,
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
self.inference_dict["pre_action"] = self.model.output_pre
self.inference_dict["pre_action"] = self.output_pre
self.inference_dict["policy_memory_out"] = self.model.memory_out
self.inference_dict["policy_memory_out"] = self.memory_out
self.total_policy_loss = self.optimizer.abs_policy_loss
self.update_dict.update(
{
"value_loss": self.optimizer.value_loss,
"policy_loss": self.total_policy_loss,
"update_batch": self.optimizer.update_batch,
}
)
@timed
def evaluate(
self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]

:return: Outputs from network as defined by self.inference_dict.
"""
feed_dict = {
self.model.batch_size: batched_step_result.n_agents(),
self.model.sequence_length: 1,
self.batch_size_ph: batched_step_result.n_agents(),
self.sequence_length_ph: 1,
feed_dict[self.model.prev_action] = self.retrieve_previous_action(
feed_dict[self.prev_action] = self.retrieve_previous_action(
feed_dict[self.model.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
size=(batched_step_result.n_agents(), self.model.act_size[0])
size=(batched_step_result.n_agents(), self.act_size[0])
feed_dict[self.model.epsilon] = epsilon
feed_dict[self.epsilon] = epsilon
def create_cc_actor(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
"""
hidden_stream = LearningModel.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
stream_scopes=["policy"],
)[0]
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
_half_point = int(self.m_size / 2)
hidden_policy, memory_policy_out = LearningModel.create_recurrent_encoder(
hidden_stream,
self.memory_in[:, :_half_point],
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = memory_policy_out
else:
hidden_policy = hidden_stream
mu = tf.layers.dense(
hidden_policy,
self.act_size[0],
activation=None,
kernel_initializer=LearningModel.scaled_init(0.01),
reuse=tf.AUTO_REUSE,
)
self.log_sigma_sq = tf.get_variable(
"log_sigma_squared",
[self.act_size[0]],
dtype=tf.float32,
initializer=tf.zeros_initializer(),
)
sigma_sq = tf.exp(self.log_sigma_sq)
self.epsilon = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
)
# Clip and scale output to ensure actions are always within [-1, 1] range.
self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(output_post)
# Compute probability of model output.
all_probs = (
-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
- 0.5 * tf.log(2.0 * np.pi)
- 0.5 * self.log_sigma_sq
)
self.all_log_probs = tf.identity(all_probs, name="action_probs")
single_dim_entropy = 0.5 * tf.reduce_mean(
tf.log(2 * np.pi * np.e) + self.log_sigma_sq
)
# Make entropy the right shape
self.entropy = tf.ones_like(tf.reshape(mu[:, 0], [-1])) * single_dim_entropy
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.log_probs = tf.reduce_sum(
(tf.identity(self.all_log_probs)), axis=1, keepdims=True
)
def create_dc_actor(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
"""
hidden_stream = LearningModel.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
stream_scopes=["policy"],
)[0]
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([hidden_stream, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
_half_point = int(self.m_size / 2)
hidden_policy, memory_policy_out = LearningModel.create_recurrent_encoder(
hidden_policy,
self.memory_in[:, :_half_point],
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = memory_policy_out
else:
hidden_policy = hidden_stream
policy_branches = []
for size in self.act_size:
policy_branches.append(
tf.layers.dense(
hidden_policy,
size,
activation=None,
use_bias=False,
kernel_initializer=LearningModel.scaled_init(0.01),
)
)
self.all_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
output, _, normalized_logits = LearningModel.create_discrete_action_masking_layer(
self.all_log_probs, self.action_masks, self.act_size
)
self.output = tf.identity(output)
self.normalized_logits = tf.identity(normalized_logits, name="action")
self.action_holder = tf.placeholder(
shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
)
self.action_oh = tf.concat(
[
tf.one_hot(self.action_holder[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
self.selected_actions = tf.stop_gradient(self.action_oh)
action_idx = [0] + list(np.cumsum(self.act_size))
self.entropy = tf.reduce_sum(
(
tf.stack(
[
tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf.nn.softmax(
self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
),
logits=self.all_log_probs[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
)
self.log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
logits=normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
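The bulk of this file is the actor-network construction moving onto the policy: create_cc_actor builds the Gaussian (continuous) head and create_dc_actor the categorical (discrete) one, both on top of LearningModel.create_observation_streams, with an optional LSTM over the first half of the memory vector. To make the continuous-control math above concrete, here is a NumPy mirror of the sampling, squashing, and log-probability computation (an illustrative re-implementation, not code from the commit):

```python
import numpy as np


def gaussian_head(mu, log_sigma_sq, epsilon):
    """Mirror of output_pre / output / all_log_probs in create_cc_actor above."""
    sigma_sq = np.exp(log_sigma_sq)
    output_pre = mu + np.sqrt(sigma_sq) * epsilon        # reparameterized sample
    output = np.clip(output_pre, -3, 3) / 3              # squashed into [-1, 1]
    # Per-dimension log density of output_pre under N(mu, sigma_sq):
    all_log_probs = (
        -0.5 * np.square(output_pre - mu) / sigma_sq
        - 0.5 * np.log(2.0 * np.pi)
        - 0.5 * log_sigma_sq
    )
    # Summing over action dimensions gives the joint log-prob used by the loss.
    return output, all_log_probs.sum(axis=1, keepdims=True)


# Example: a batch of 2 agents with a 3-dimensional continuous action space.
mu = np.zeros((2, 3))
log_sigma_sq = np.zeros((2, 3))
epsilon = np.random.randn(2, 3)
action, log_prob = gaussian_head(mu, log_sigma_sq, epsilon)
```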

ml-agents/mlagents/trainers/rl_trainer.py (6 changed lines)


from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignalResult
LOGGER = logging.getLogger("mlagents.trainers")

rewards[agent_id] = 0
else:
self.stats_reporter.add_stat(
policy.optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0)
policy.optimizer.reward_signals[name].stat_name,
rewards.get(agent_id, 0),
)
rewards[agent_id] = 0

ml-agents/mlagents/trainers/tests/test_ppo.py (2 changed lines)


from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.policy import PPOPolicy
from mlagents.trainers.models import EncoderType, LearningModel
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.brain import BrainParameters, CameraResolution
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents_envs.environment import UnityEnvironment

ml-agents/mlagents/trainers/tf_policy.py (120 changed lines)


from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.env_manager import get_global_agent_id
from mlagents_envs.base_env import BatchedStepResult
from mlagents.trainers.models import LearningModel
logger = logging.getLogger("mlagents.trainers")

:param brain: The corresponding Brain for this policy.
:param trainer_parameters: The trainer parameters.
"""
self.m_size = None
self.model = None
self._version_number_ = 2
self.m_size = 0
self.act_size = brain.vector_action_space_size
self.vec_obs_size = brain.vector_observation_space_size
self.vis_obs_size = brain.number_visual_observations
self.use_recurrent = trainer_parameters["use_recurrent"]
self.memory_dict: Dict[str, np.ndarray] = {}
self.reward_signals: Dict[str, "RewardSignal"] = {}

# to prevent exceptions if the device doesn't support the operation
# or the device does not exist
config.allow_soft_placement = True
tf.set_random_seed(seed)
self.sess = tf.Session(config=config, graph=self.graph)
self.saver = None
self.optimizer = None

brain.brain_name, self.m_size
)
)
self._initialize_tensorflow_references()
def _initialize_graph(self):
with self.graph.as_default():

def fill_eval_dict(self, feed_dict, batched_step_result):
vec_vis_obs = SplitObservations.from_observations(batched_step_result.obs)
for i, _ in enumerate(vec_vis_obs.visual_observations):
feed_dict[self.model.visual_in[i]] = vec_vis_obs.visual_observations[i]
feed_dict[self.visual_in[i]] = vec_vis_obs.visual_observations[i]
feed_dict[self.model.vector_in] = vec_vis_obs.vector_observations
feed_dict[self.vector_in] = vec_vis_obs.vector_observations
if not self.use_continuous_act:
mask = np.ones(
(

)
if batched_step_result.action_mask is not None:
mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
feed_dict[self.model.action_masks] = mask
feed_dict[self.action_masks] = mask
return feed_dict
def make_empty_memory(self, num_agents):

Gets current model step.
:return: current model step.
"""
step = self.sess.run(self.model.global_step)
step = self.sess.run(self.global_step)
return step
def increment_step(self, n_steps):

out_dict = {
"global_step": self.model.global_step,
"increment_step": self.model.increment_step,
"global_step": self.global_step,
"increment_step": self.increment_step_op,
feed_dict = {self.model.steps_to_increment: n_steps}
feed_dict = {self.steps_to_increment: n_steps}
return self.sess.run(out_dict, feed_dict=feed_dict)["global_step"]
def get_inference_vars(self):

"""
if self.use_vec_obs and self.normalize:
self.sess.run(
self.model.update_normalization,
feed_dict={self.model.vector_in: vector_obs},
self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
def vis_obs_size(self):
return self.model.vis_obs_size
@property
def vec_obs_size(self):
return self.model.vec_obs_size
@property
return self.model.vis_obs_size > 0
return self.vis_obs_size > 0
return self.model.vec_obs_size > 0
return self.vec_obs_size > 0
def _initialize_tensorflow_references(self):
with self.graph.as_default():
self.value_heads: Dict[str, tf.Tensor] = {}
self.normalization_steps: Optional[tf.Variable] = None
self.running_mean: Optional[tf.Variable] = None
self.running_variance: Optional[tf.Variable] = None
self.update_normalization_op: Optional[tf.Operation] = None
self.value: Optional[tf.Tensor] = None
self.all_log_probs: Optional[tf.Tensor] = None
self.output: Optional[tf.Tensor] = None
self.selected_actions: Optional[tf.Tensor] = None
self.action_holder: Optional[tf.Tensor] = None
self.action_masks: Optional[tf.Tensor] = None
self.prev_action: Optional[tf.Tensor] = None
self.global_step, self.increment_step_op, self.steps_to_increment = (
LearningModel.create_global_steps()
)
self.visual_in = LearningModel.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_in = LearningModel.create_vector_input(self.vec_obs_size)
if self.normalize:
normalization_tensors = LearningModel.create_normalizer(self.vector_in)
self.update_normalization_op = normalization_tensors[0]
self.normalization_steps = normalization_tensors[1]
self.running_mean = normalization_tensors[2]
self.running_variance = normalization_tensors[3]
self.processed_vector_in = LearningModel.normalize_vector_obs(
self.vector_in,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_in = self.vector_in
self.update_normalization_op = None
self.batch_size_ph = tf.placeholder(
shape=None, dtype=tf.int32, name="batch_size"
)
self.sequence_length_ph = tf.placeholder(
shape=None, dtype=tf.int32, name="sequence_length"
)
self.mask_input = tf.placeholder(
shape=[None], dtype=tf.float32, name="masks"
)
self.mask = tf.cast(self.mask_input, tf.int32)
tf.Variable(
int(self.brain.vector_action_space_type == "continuous"),
name="is_continuous_control",
trainable=False,
dtype=tf.int32,
)
tf.Variable(
self._version_number_,
name="version_number",
trainable=False,
dtype=tf.int32,
)
tf.Variable(
self.m_size, name="memory_size", trainable=False, dtype=tf.int32
)
if self.brain.vector_action_space_type == "continuous":
tf.Variable(
self.act_size[0],
name="action_output_shape",
trainable=False,
dtype=tf.int32,
)
else:
tf.Variable(
sum(self.act_size),
name="action_output_shape",
trainable=False,
dtype=tf.int32,
)
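TFPolicy now creates everything the old self.model provided for inference: the global step, the shared placeholders (batch_size_ph, sequence_length_ph, mask_input), the visual/vector inputs, the observation normalizer, and the constant variables the exported graph expects (is_continuous_control, version_number, memory_size, action_output_shape). The three step-tracking tensors come from LearningModel.create_global_steps(); its implementation is not shown in this hunk, so the sketch below is inferred from how the outputs are used above and may differ in details:

```python
from mlagents.tf_utils import tf


def create_global_steps():
    """Inferred sketch of LearningModel.create_global_steps(), not the real code."""
    global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
    steps_to_increment = tf.placeholder(
        shape=[], dtype=tf.int32, name="steps_to_increment"
    )
    # increment_step_op adds steps_to_increment to the step counter when run.
    increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment))
    return global_step, increment_step, steps_to_increment
```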

ml-agents/mlagents/trainers/trainer.py (10 changed lines)


from collections import deque
from mlagents_envs.exception import UnityException
from mlagents_envs.timers import set_gauge
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.stats import StatsReporter

from mlagents.trainers.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
class UnityTrainerException(UnityException):
"""
Related to errors with the Trainer.
"""
pass
class Trainer(abc.ABC):

ml-agents/mlagents/trainers/trainer_util.py (3 changed lines)


from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.exception import TrainerConfigError
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.sac.trainer import SACTrainer
