
Move optimizer creation to Trainer, fix some of the reward signals

/develop/nopreviousactions
Ervin Teng, 4 years ago
Current commit
164732a9
13 changed files with 197 additions and 220 deletions
  1. ml-agents/mlagents/trainers/components/bc/model.py (7 lines changed)
  2. ml-agents/mlagents/trainers/components/bc/module.py (34 lines changed)
  3. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (16 lines changed)
  4. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (54 lines changed)
  5. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (18 lines changed)
  6. ml-agents/mlagents/trainers/optimizer.py (14 lines changed)
  7. ml-agents/mlagents/trainers/ppo/optimizer.py (105 lines changed)
  8. ml-agents/mlagents/trainers/ppo/policy.py (115 lines changed)
  9. ml-agents/mlagents/trainers/ppo/trainer.py (26 lines changed)
  10. ml-agents/mlagents/trainers/rl_trainer.py (7 lines changed)
  11. ml-agents/mlagents/trainers/tests/test_ppo.py (12 lines changed)
  12. ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 lines changed)
  13. ml-agents/mlagents/trainers/tf_policy.py (7 lines changed)
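
Before the per-file hunks, a minimal orientation sketch of the ownership change this commit makes. The class names below (SimplePolicy, SimpleOptimizer, SimpleTrainer) are illustrative stand-ins, not the real ml-agents classes: previously the policy constructed its own optimizer and reward signals, while after this change the trainer builds the optimizer from a finished policy plus the trainer parameters and then finishes policy initialization.

```python
# Hypothetical, simplified sketch of the ownership change in this commit.
from typing import Any, Dict


class SimplePolicy:
    """Builds only the actor side; no optimizer or reward signals."""

    def __init__(self, trainer_params: Dict[str, Any]) -> None:
        self.trainer_params = trainer_params
        self.initialized = False

    def initialize_or_load(self) -> None:
        # Deferred until the trainer has attached an optimizer, so that
        # optimizer-side variables are covered by the same init/load step.
        self.initialized = True


class SimpleOptimizer:
    """Owns the critic, losses, and reward signals; built from a finished policy."""

    def __init__(self, policy: SimplePolicy, trainer_params: Dict[str, Any]) -> None:
        self.policy = policy
        self.reward_signals = dict.fromkeys(trainer_params.get("reward_signals", {}))


class SimpleTrainer:
    """After this commit, the trainer (not the policy) creates the optimizer."""

    def __init__(self, trainer_params: Dict[str, Any]) -> None:
        self.trainer_parameters = trainer_params

    def add_policy(self, policy: SimplePolicy) -> None:
        self.policy = policy
        self.optimizer = SimpleOptimizer(policy, self.trainer_parameters)
        self.policy.initialize_or_load()


trainer = SimpleTrainer({"reward_signals": {"extrinsic": {}, "curiosity": {}}})
trainer.add_policy(SimplePolicy(trainer.trainer_parameters))
print(sorted(trainer.optimizer.reward_signals))  # ['curiosity', 'extrinsic']
```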

ml-agents/mlagents/trainers/components/bc/model.py (7 lines changed)

from mlagents.tf_utils import tf
- from mlagents.trainers.models import LearningModel
+ from mlagents.trainers.tf_policy import TFPolicy
- self,
- policy_model: LearningModel,
- learning_rate: float = 3e-4,
- anneal_steps: int = 0,
+ self, policy_model: TFPolicy, learning_rate: float = 3e-4, anneal_steps: int = 0
):
"""
Tensorflow operations to perform Behavioral Cloning on a Policy model

ml-agents/mlagents/trainers/components/bc/module.py (34 lines changed)

"""
self.policy = policy
self.current_lr = policy_learning_rate * strength
- self.model = BCModel(policy.model, self.current_lr, steps)
+ self.model = BCModel(policy, self.current_lr, steps)
_, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length)
self.batch_size = batch_size if batch_size else default_batch_size

Helper function for update_batch.
"""
feed_dict = {
- self.policy.model.batch_size: n_sequences,
- self.policy.model.sequence_length: self.policy.sequence_length,
+ self.policy.batch_size_ph: n_sequences,
+ self.policy.sequence_length: self.policy.sequence_length,
- if self.policy.model.brain.vector_action_space_type == "continuous":
- feed_dict[self.policy.model.epsilon] = np.random.normal(
- size=(1, self.policy.model.act_size[0])
+ if self.policy.brain.vector_action_space_type == "continuous":
+ feed_dict[self.policy.epsilon] = np.random.normal(
+ size=(1, self.policy.act_size[0])
- feed_dict[self.policy.model.action_masks] = np.ones(
+ feed_dict[self.policy.action_masks] = np.ones(
- sum(self.policy.model.brain.vector_action_space_size),
+ sum(self.policy.brain.vector_action_space_size),
- if self.policy.model.brain.vector_observation_space_size > 0:
- feed_dict[self.policy.model.vector_in] = mini_batch_demo["vector_obs"]
- for i, _ in enumerate(self.policy.model.visual_in):
- feed_dict[self.policy.model.visual_in[i]] = mini_batch_demo[
- "visual_obs%d" % i
- ]
+ if self.policy.brain.vector_observation_space_size > 0:
+ feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
+ for i, _ in enumerate(self.policy.visual_in):
+ feed_dict[self.policy.visual_in[i]] = mini_batch_demo["visual_obs%d" % i]
- feed_dict[self.policy.model.memory_in] = np.zeros(
+ feed_dict[self.policy.memory_in] = np.zeros(
- if not self.policy.model.brain.vector_action_space_type == "continuous":
- feed_dict[self.policy.model.prev_action] = mini_batch_demo[
- "prev_action"
- ]
+ if not self.policy.brain.vector_action_space_type == "continuous":
+ feed_dict[self.policy.prev_action] = mini_batch_demo["prev_action"]
network_out = self.policy.sess.run(
list(self.out_dict.values()), feed_dict=feed_dict
)
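
The recurring change in this file, and in the reward-signal files below, is that feed dictionaries are keyed on placeholders that now live directly on the policy (batch_size_ph, vector_in, epsilon, and so on) rather than being reached through policy.model. A rough sketch of the pattern, using a fake policy object and plain strings as stand-ins for TF placeholders:

```python
import numpy as np


class FakePolicy:
    # Stand-ins for TF placeholders; after the refactor these hang directly
    # off the policy rather than off policy.model.
    batch_size_ph = "batch_size_ph"
    sequence_length_ph = "sequence_length_ph"
    vector_in = "vector_in"
    sequence_length = 1


def make_feed_dict(policy, mini_batch_demo, n_sequences):
    # Old: policy.model.batch_size, policy.model.vector_in, ...
    # New: the same tensors, reached through the policy itself.
    return {
        policy.batch_size_ph: n_sequences,
        policy.sequence_length_ph: policy.sequence_length,
        policy.vector_in: mini_batch_demo["vector_obs"],
    }


demo = {"vector_obs": np.zeros((4, 8), dtype=np.float32)}
print(sorted(make_feed_dict(FakePolicy(), demo, n_sequences=4)))
```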

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (16 lines changed)

def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
- self.policy.model.batch_size: len(mini_batch["actions"]),
- self.policy.model.sequence_length: self.policy.sequence_length,
+ self.policy.batch_size_ph: len(mini_batch["actions"]),
+ self.policy.sequence_length_ph: self.policy.sequence_length,
- feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+ feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
- if self.policy.model.vis_obs_size > 0:
- for i in range(len(self.policy.model.visual_in)):
+ if self.policy.vis_obs_size > 0:
+ for i in range(len(self.policy.visual_in)):
- feed_dict[self.policy.model.visual_in[i]] = _obs
+ feed_dict[self.policy.visual_in[i]] = _obs
- feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
+ feed_dict[self.policy.selected_actions] = mini_batch["actions"]
- feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+ feed_dict[self.policy.action_holder] = mini_batch["actions"]
unscaled_reward = self.policy.sess.run(
self.model.intrinsic_reward, feed_dict=feed_dict
)

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (54 lines changed)

from mlagents.tf_utils import tf
+ from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.models import LearningModel
EPSILON = 1e-7

def __init__(
self,
- policy_model: LearningModel,
+ policy: TFPolicy,
h_size: int = 128,
learning_rate: float = 3e-4,
encoding_size: int = 64,

self.z_size = 128
self.alpha = 0.0005
self.mutual_information = 0.5
- self.policy_model = policy_model
+ self.policy = policy
self.encoding_size = encoding_size
self.gradient_penalty_weight = gradient_penalty_weight
self.use_vail = use_vail

self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
self.done_policy = tf.expand_dims(self.done_policy_holder, -1)
- if self.policy_model.brain.vector_action_space_type == "continuous":
- action_length = self.policy_model.act_size[0]
+ if self.policy.brain.vector_action_space_type == "continuous":
+ action_length = self.policy.act_size[0]
- action_length = len(self.policy_model.act_size)
+ action_length = len(self.policy.act_size)
self.action_in_expert = tf.placeholder(
shape=[None, action_length], dtype=tf.int32
)

- for i, act_size in enumerate(self.policy_model.act_size)
+ for i, act_size in enumerate(self.policy.act_size)
],
axis=1,
)

- if self.policy_model.vec_obs_size > 0:
+ if self.policy.vec_obs_size > 0:
- shape=[None, self.policy_model.vec_obs_size], dtype=tf.float32
+ shape=[None, self.policy.vec_obs_size], dtype=tf.float32
- if self.policy_model.normalize:
+ if self.policy.normalize:
- self.policy_model.normalize_vector_obs(self.obs_in_expert)
- )
- encoded_policy_list.append(
- self.policy_model.normalize_vector_obs(self.policy_model.vector_in)
+ LearningModel.normalize_vector_obs(
+ self.obs_in_expert,
+ self.policy.running_mean,
+ self.policy.running_variance,
+ self.policy.normalization_steps,
+ )
+ encoded_policy_list.append(self.policy.processed_vector_in)
- encoded_policy_list.append(self.policy_model.vector_in)
+ encoded_policy_list.append(self.policy.vector_in)
- if self.policy_model.vis_obs_size > 0:
+ if self.policy.vis_obs_size > 0:
- for i in range(self.policy_model.vis_obs_size):
+ for i in range(self.policy.vis_obs_size):
- visual_input = self.policy_model.create_visual_input(
- self.policy_model.brain.camera_resolutions[i],
+ visual_input = LearningModel.create_visual_input(
+ self.policy.brain.camera_resolutions[i],
- encoded_policy_visual = self.policy_model.create_visual_observation_encoder(
- self.policy_model.visual_in[i],
+ encoded_policy_visual = LearningModel.create_visual_observation_encoder(
+ self.policy.visual_in[i],
self.encoding_size,
LearningModel.swish,
1,

- encoded_expert_visual = self.policy_model.create_visual_observation_encoder(
+ encoded_expert_visual = LearningModel.create_visual_observation_encoder(
self.expert_visual_in[i],
self.encoding_size,
LearningModel.swish,

)
self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
self.encoded_policy,
- self.policy_model.selected_actions,
+ self.policy.selected_actions,
self.done_policy,
reuse=True,
)

for off-policy. Compute gradients w.r.t randomly interpolated input.
"""
expert = [self.encoded_expert, self.expert_action, self.done_expert]
- policy = [
- self.encoded_policy,
- self.policy_model.selected_actions,
- self.done_policy,
- ]
+ policy = [self.encoded_policy, self.policy.selected_actions, self.done_policy]
interp = []
for _expert_in, _policy_in in zip(expert, policy):
alpha = tf.random_uniform(tf.shape(_expert_in))

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (18 lines changed)

self.use_terminal_states = False
self.model = GAILModel(
- policy.model, 128, learning_rate, encoding_size, use_actions, use_vail
+ policy, 128, learning_rate, encoding_size, use_actions, use_vail
)
_, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length)
self.has_updated = False

def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
- self.policy.model.batch_size: len(mini_batch["actions"]),
- self.policy.model.sequence_length: self.policy.sequence_length,
+ self.policy.batch_size_ph: len(mini_batch["actions"]),
+ self.policy.sequence_length_ph: self.policy.sequence_length,
- feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
- if self.policy.model.vis_obs_size > 0:
- for i in range(len(self.policy.model.visual_in)):
+ feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
+ if self.policy.vis_obs_size > 0:
+ for i in range(len(self.policy.visual_in)):
- feed_dict[self.policy.model.visual_in[i]] = _obs
+ feed_dict[self.policy.visual_in[i]] = _obs
- feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
+ feed_dict[self.policy.selected_actions] = mini_batch["actions"]
- feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+ feed_dict[self.policy.action_holder] = mini_batch["actions"]
feed_dict[self.model.done_policy_holder] = np.array(
mini_batch["done"]
).flatten()

ml-agents/mlagents/trainers/optimizer.py (14 lines changed)

from mlagents.tf_utils.tf import tf
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.tf_policy import TFPolicy
- from mlagents.trainers.models import LearningModel
+ from mlagents.trainers.policy import Policy
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,

"""
@abc.abstractmethod
- def __init__(self, policy: LearningModel, optimizer_parameters: Dict[str, Any]):
+ def __init__(self, policy: Policy):
"""
Create loss functions and auxillary networks.
"""

class TFOptimizer(Optimizer, abc.ABC):  # pylint: disable=W0223
- def __init__(
- self, sess: tf.Session, policy: TFPolicy, reward_signal_configs: Dict[str, Any]
- ):
- super().__init__(policy, reward_signal_configs)
- self.sess = sess
+ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
+ super().__init__(policy)
+ self.sess = policy.sess
- self.create_reward_signals(reward_signal_configs)
+ self.create_reward_signals(trainer_params["reward_signals"])
def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
feed_dict: Dict[tf.Tensor, Any] = {
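
Restating the base-class change in this file: an optimizer is no longer handed a session and a reward-signal config separately; it is constructed from a policy plus the trainer's parameter dict and pulls the session and the "reward_signals" sub-dict from those. A framework-free sketch of that constructor shape (FakeSession and FakePolicy are stand-ins, not ml-agents or TensorFlow types):

```python
from typing import Any, Dict


class FakeSession:
    pass


class FakePolicy:
    def __init__(self) -> None:
        self.sess = FakeSession()


class OptimizerSketch:
    # Old shape: __init__(self, sess, policy, reward_signal_configs)
    # New shape: __init__(self, policy, trainer_params)
    def __init__(self, policy: FakePolicy, trainer_params: Dict[str, Any]) -> None:
        self.policy = policy
        self.sess = policy.sess  # the session now comes from the policy
        self.reward_signals = dict(trainer_params["reward_signals"])


opt = OptimizerSketch(FakePolicy(), {"reward_signals": {"extrinsic": {"gamma": 0.99}}})
print(opt.reward_signals["extrinsic"]["gamma"])  # 0.99
```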

ml-agents/mlagents/trainers/ppo/optimizer.py (105 lines changed)

class PPOOptimizer(TFOptimizer):
- def __init__(self, brain, sess, policy, reward_signal_configs, trainer_params):
+ def __init__(self, policy, trainer_params):
"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.

:param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
:return: a sub-class of PPOAgent tailored to the environment.
"""
- super().__init__(sess, policy, reward_signal_configs)
+ with policy.graph.as_default():
+ with tf.variable_scope("optimizer/"):
+ super().__init__(policy, trainer_params)
+ lr = float(trainer_params["learning_rate"])
+ lr_schedule = LearningRateSchedule(
+ trainer_params.get("learning_rate_schedule", "linear")
+ )
+ h_size = int(trainer_params["hidden_units"])
+ epsilon = float(trainer_params["epsilon"])
+ beta = float(trainer_params["beta"])
+ max_step = float(trainer_params["max_steps"])
+ num_layers = int(trainer_params["num_layers"])
+ vis_encode_type = EncoderType(
+ trainer_params.get("vis_encode_type", "simple")
+ )
- lr = float(trainer_params["learning_rate"])
- lr_schedule = LearningRateSchedule(
- trainer_params.get("learning_rate_schedule", "linear")
- )
- h_size = int(trainer_params["hidden_units"])
- epsilon = float(trainer_params["epsilon"])
- beta = float(trainer_params["beta"])
- max_step = float(trainer_params["max_steps"])
- num_layers = int(trainer_params["num_layers"])
- vis_encode_type = EncoderType(trainer_params.get("vis_encode_type", "simple"))
+ self.stream_names = self.reward_signals.keys()
- self.stream_names = self.reward_signals.keys()
+ self.optimizer: Optional[tf.train.AdamOptimizer] = None
+ self.grads = None
+ self.update_batch: Optional[tf.Operation] = None
- self.optimizer: Optional[tf.train.AdamOptimizer] = None
- self.grads = None
- self.update_batch: Optional[tf.Operation] = None
+ self.stats_name_to_update_name = {
+ "Losses/Value Loss": "value_loss",
+ "Losses/Policy Loss": "policy_loss",
+ }
- self.stats_name_to_update_name = {
- "Losses/Value Loss": "value_loss",
- "Losses/Policy Loss": "policy_loss",
- }
+ if num_layers < 1:
+ num_layers = 1
+ if policy.use_continuous_act:
+ self.create_cc_critic(h_size, num_layers, vis_encode_type)
+ else:
+ self.create_dc_critic(h_size, num_layers, vis_encode_type)
- if num_layers < 1:
- num_layers = 1
- if brain.vector_action_space_type == "continuous":
- self.create_cc_critic(h_size, num_layers, vis_encode_type)
- else:
- self.create_dc_critic(h_size, num_layers, vis_encode_type)
+ self.learning_rate = LearningModel.create_learning_rate(
+ lr_schedule, lr, self.policy.global_step, max_step
+ )
+ self.create_losses(
+ self.policy.log_probs,
+ self.old_log_probs,
+ self.value_heads,
+ self.policy.entropy,
+ beta,
+ epsilon,
+ lr,
+ max_step,
+ )
+ self.create_ppo_optimizer()
- self.learning_rate = LearningModel.create_learning_rate(
- lr_schedule, lr, self.policy.global_step, max_step
- )
- self.create_losses(
- self.policy.log_probs,
- self.old_log_probs,
- self.value_heads,
- self.policy.entropy,
- beta,
- epsilon,
- lr,
- max_step,
- )
- self.create_ppo_optimizer()
+ self.update_dict.update(
+ {
+ "value_loss": self.value_loss,
+ "policy_loss": self.abs_policy_loss,
+ "update_batch": self.update_batch,
+ }
+ )
- self.update_dict.update(
- {
- "value_loss": self.value_loss,
- "policy_loss": self.abs_policy_loss,
- "update_batch": self.update_batch,
- }
- )
+ # Add some stuff to inference dict from optimizer
+ self.policy.inference_dict["learning_rate"] = self.learning_rate
+ if self.policy.use_recurrent:
+ self.policy.inference_dict["optimizer_memory_out"]: self.memory_out
def create_cc_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType

h_size,
num_layers,
vis_encode_type,
stream_scopes=["optimizer"],
)[0]
if self.policy.use_recurrent:

h_size,
num_layers,
vis_encode_type,
stream_scopes=["optimizer"],
)[0]
if self.policy.use_recurrent:
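
The re-indented blocks above appear to wrap the optimizer's graph construction in the policy's graph, under a dedicated variable scope, so that critic and optimizer variables stay separate from the actor's. A small TF1-style sketch of that scoping idea (illustrative variable names, a plain "optimizer" scope, and it assumes TensorFlow is installed):

```python
# Sketch of building optimizer-side variables in the policy's graph under a
# dedicated scope. Names are illustrative, not the real ml-agents graph layout.
import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

policy_graph = tf1.Graph()

# The policy builds its actor variables in its own graph and scope...
with policy_graph.as_default():
    with tf1.variable_scope("policy"):
        actor_hidden = tf1.get_variable("hidden", shape=[8, 8])

# ...and the optimizer later attaches to the same graph, but under its own scope.
with policy_graph.as_default():
    with tf1.variable_scope("optimizer"):
        value_head = tf1.get_variable("value_head", shape=[8, 1])

print([v.name for v in policy_graph.get_collection(tf1.GraphKeys.GLOBAL_VARIABLES)])
# ['policy/hidden:0', 'optimizer/value_head:0']
```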

ml-agents/mlagents/trainers/ppo/policy.py (115 lines changed)

from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import LearningModel
- from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.components.bc.module import BCModule

:param is_training: Whether the model should be trained.
:param load: Whether a pre-trained model will be loaded or a new one created.
"""
- super().__init__(seed, brain, trainer_params)
+ with tf.variable_scope("policy"):
+ super().__init__(seed, brain, trainer_params)
reward_signal_configs = trainer_params["reward_signals"]
self.inference_dict: Dict[str, tf.Tensor] = {}
self.update_dict: Dict[str, tf.Tensor] = {}
- self.stats_name_to_update_name = {
- "Losses/Value Loss": "value_loss",
- "Losses/Policy Loss": "policy_loss",
- }
+ self.stats_name_to_update_name = {
+ "Losses/Value Loss": "value_loss",
+ "Losses/Policy Loss": "policy_loss",
+ }
- self.optimizer: Optional[tf.train.AdamOptimizer] = None
- self.grads = None
- self.update_batch: Optional[tf.Operation] = None
- num_layers = trainer_params["num_layers"]
- h_size = trainer_params["hidden_units"]
- if num_layers < 1:
- num_layers = 1
- vis_encode_type = EncoderType(trainer_params.get("vis_encode_type", "simple"))
+ self.optimizer: Optional[tf.train.AdamOptimizer] = None
+ self.grads = None
+ self.update_batch: Optional[tf.Operation] = None
+ num_layers = trainer_params["num_layers"]
+ h_size = trainer_params["hidden_units"]
+ if num_layers < 1:
+ num_layers = 1
+ vis_encode_type = EncoderType(
+ trainer_params.get("vis_encode_type", "simple")
+ )
- with self.graph.as_default():
- if brain.vector_action_space_type == "continuous":
- self.create_cc_actor(h_size, num_layers, vis_encode_type)
- else:
- self.create_dc_actor(h_size, num_layers, vis_encode_type)
- self.bc_module: Optional[BCModule] = None
- # Create pretrainer if needed
- if "behavioral_cloning" in trainer_params:
- BCModule.check_config(trainer_params["behavioral_cloning"])
- self.bc_module = BCModule(
- self,
- policy_learning_rate=trainer_params["learning_rate"],
- default_batch_size=trainer_params["batch_size"],
- default_num_epoch=3,
- **trainer_params["behavioral_cloning"],
- )
+ with self.graph.as_default():
+ if self.use_continuous_act:
+ self.create_cc_actor(h_size, num_layers, vis_encode_type)
+ else:
+ self.create_dc_actor(h_size, num_layers, vis_encode_type)
+ self.bc_module: Optional[BCModule] = None
+ # Create pretrainer if needed
+ if "behavioral_cloning" in trainer_params:
+ BCModule.check_config(trainer_params["behavioral_cloning"])
+ self.bc_module = BCModule(
+ self,
+ policy_learning_rate=trainer_params["learning_rate"],
+ default_batch_size=trainer_params["batch_size"],
+ default_num_epoch=3,
+ **trainer_params["behavioral_cloning"],
+ )
- self.create_optimizer(
- brain, trainer_params, reward_signal_configs, is_training, load, seed
- )
+ self.inference_dict: Dict[str, tf.Tensor] = {
+ "action": self.output,
+ "log_probs": self.all_log_probs,
+ "entropy": self.entropy,
+ }
+ if self.use_continuous_act:
+ self.inference_dict["pre_action"] = self.output_pre
+ if self.use_recurrent:
+ self.inference_dict["policy_memory_out"] = self.memory_out
+ self.load = load
- if load:
+ def initialize_or_load(self):
+ if self.load:
- def create_optimizer(
- self, brain, trainer_params, reward_signal_configs, is_training, load, seed
- ):
- """
- Create PPO model
- :param brain: Assigned Brain object.
- :param trainer_params: Defined training parameters.
- :param reward_signal_configs: Reward signal config
- :param seed: Random seed.
- """
- with self.graph.as_default():
- self.optimizer = PPOOptimizer(
- brain=brain,
- policy=self,
- sess=self.sess,
- reward_signal_configs=reward_signal_configs,
- trainer_params=trainer_params,
- )
- self.optimizer.create_ppo_optimizer()
- self.inference_dict.update(
- {
- "action": self.output,
- "log_probs": self.all_log_probs,
- "entropy": self.entropy,
- "learning_rate": self.optimizer.learning_rate,
- }
- )
- if self.use_continuous_act:
- self.inference_dict["pre_action"] = self.output_pre
- if self.use_recurrent:
- self.inference_dict["policy_memory_out"] = self.memory_out
- self.inference_dict["optimizer_memory_out"] = self.optimizer.memory_out
@timed
def evaluate(
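
The policy-side counterpart of the move: instead of loading or initializing its graph inside __init__, the policy now remembers the load flag and exposes initialize_or_load(), which the trainer calls only after the optimizer exists, presumably so optimizer variables are covered by the same initialize/restore step. A toy sketch of that deferral, with made-up names:

```python
class PolicySketch:
    """Made-up class illustrating deferred initialization, not the real PPOPolicy."""

    def __init__(self, load: bool) -> None:
        # Old behaviour: load or initialize right here, before any optimizer exists.
        # New behaviour: just remember the flag and wait for the trainer.
        self.load = load
        self.state = "constructed"

    def initialize_or_load(self) -> None:
        # Called by the trainer after the optimizer's variables have been created.
        self.state = "loaded" if self.load else "initialized"


policy = PolicySketch(load=False)
# ... the trainer would create its optimizer here ...
policy.initialize_or_load()
print(policy.state)  # initialized
```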

ml-agents/mlagents/trainers/ppo/trainer.py (26 lines changed)

from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.tf_policy import TFPolicy
+ from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.trajectory import Trajectory
logger = logging.getLogger("mlagents.trainers")

self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
# Get all value estimates
- value_estimates = self.policy.optimizer.get_batched_value_estimates(
+ value_estimates = self.optimizer.get_batched_value_estimates(
- self.policy.optimizer.reward_signals[name].value_name, np.mean(v)
+ self.optimizer.reward_signals[name].value_name, np.mean(v)
- value_next = self.policy.optimizer.get_value_estimates(
+ value_next = self.optimizer.get_value_estimates(
trajectory.next_obs,
agent_id,
trajectory.done_reached and not trajectory.max_step_reached,

self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
- for name, reward_signal in self.policy.optimizer.reward_signals.items():
+ for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward

# Compute GAE and returns
tmp_advantages = []
tmp_returns = []
- for name in self.policy.optimizer.reward_signals:
+ for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
local_rewards = agent_buffer_trajectory[

rewards=local_rewards,
value_estimates=local_value_estimates,
value_next=bootstrap_value,
- gamma=self.policy.optimizer.reward_signals[name].gamma,
+ gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.trainer_parameters["lambd"],
)
local_return = local_advantage + local_value_estimates

# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
- self._update_end_episode_stats(
- agent_id, self.get_policy(trajectory.behavior_id)
- )
+ self._update_end_episode_stats(agent_id, self.optimizer)
def _is_ready_update(self):
"""

buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for l in range(0, max_num_batch * batch_size, batch_size):
- update_stats = self.policy.optimizer.update(
+ update_stats = self.optimizer.update(
buffer.make_mini_batch(l, l + batch_size), n_sequences
)
for stat_name, value in update_stats.items():

self.load,
)
- for _reward_signal in policy.optimizer.reward_signals.keys():
- self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
return policy
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:

if not isinstance(policy, PPOPolicy):
raise RuntimeError("Non-PPOPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
+ self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
+ self.policy.initialize_or_load()
+ for _reward_signal in self.optimizer.reward_signals.keys():
+ self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""

ml-agents/mlagents/trainers/rl_trainer.py (7 lines changed)

from typing import Dict
from collections import defaultdict
- from mlagents.trainers.tf_policy import TFPolicy
+ from mlagents.trainers.optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException

for agent_id in rewards:
rewards[agent_id] = 0
- def _update_end_episode_stats(self, agent_id: str, policy: TFPolicy) -> None:
+ def _update_end_episode_stats(self, agent_id: str, optimizer: TFOptimizer) -> None:
self.episode_steps[agent_id] = 0
for name, rewards in self.collected_rewards.items():
if name == "environment":

rewards[agent_id] = 0
else:
self.stats_reporter.add_stat(
- policy.optimizer.reward_signals[name].stat_name,
- rewards.get(agent_id, 0),
+ optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0)
)
rewards[agent_id] = 0

ml-agents/mlagents/trainers/tests/test_ppo.py (12 lines changed)

# Check that the running mean and variance is correct
steps, mean, variance = trainer.policy.sess.run(
[
- trainer.policy.model.normalization_steps,
- trainer.policy.model.running_mean,
- trainer.policy.model.running_variance,
+ trainer.policy.normalization_steps,
+ trainer.policy.running_mean,
+ trainer.policy.running_variance,
]
)

# Check that the running mean and variance is correct
steps, mean, variance = trainer.policy.sess.run(
[
- trainer.policy.model.normalization_steps,
- trainer.policy.model.running_mean,
- trainer.policy.model.running_variance,
+ trainer.policy.normalization_steps,
+ trainer.policy.running_mean,
+ trainer.policy.running_variance,
]
)

ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 lines changed)

def reward_signal_update(policy, reward_signal_name):
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
feed_dict = policy.reward_signals[reward_signal_name].prepare_update(
- policy.model, buffer.make_mini_batch(0, 10), 2
+ policy, buffer.make_mini_batch(0, 10), 2
)
out = policy._execute_model(
feed_dict, policy.reward_signals[reward_signal_name].update_dict

ml-agents/mlagents/trainers/tf_policy.py (7 lines changed)

self.update_normalization_op: Optional[tf.Operation] = None
self.value: Optional[tf.Tensor] = None
self.all_log_probs: Optional[tf.Tensor] = None
self.action_oh: tf.Tensor = None
self.output_pre: Optional[tf.Tensor] = None
self.memory_in: Optional[tf.Tensor] = None
self.global_step, self.increment_step_op, self.steps_to_increment = (
LearningModel.create_global_steps()

)
self.mask_input = tf.placeholder(
shape=[None], dtype=tf.float32, name="masks"
)
+ # Only needed for PPO, but needed for BC module
+ self.epsilon = tf.placeholder(
+ shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
+ )
self.mask = tf.cast(self.mask_input, tf.int32)
