
integrate the implementation and hyperparameters

/develop/bisim-review
yanchaosun, 4 years ago
Current commit: ac4c80c2
6 files changed, 386 insertions and 210 deletions
  1. config/ppo_transfer/3DBallHard.yaml (1 change)
  2. ml-agents/mlagents/trainers/models.py (5 changes)
  3. ml-agents/mlagents/trainers/policy/transfer_policy.py (280 changes)
  4. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (181 changes)
  5. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (91 changes)
  6. ml-agents/mlagents/trainers/settings.py (38 changes)

config/ppo_transfer/3DBallHard.yaml (1 change)


lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
conv_thres: 1e-3
network_settings:
normalize: true
hidden_units: 128
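The only addition to this config is the conv_thres entry, the convergence threshold the transfer trainer uses to decide when the dynamics/encoder model has stopped improving (see trainer.py below). A minimal sketch of that check, using a hypothetical model_converged helper:

import numpy as np

def model_converged(old_loss: float, recent_losses, conv_thres: float = 1e-3) -> bool:
    # Converged once the mean model loss stops changing by more than conv_thres.
    new_loss = float(np.mean(recent_losses))
    return abs(old_loss - new_loss) < conv_thres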

ml-agents/mlagents/trainers/models.py (5 changes)


num_layers: int,
vis_encode_type: EncoderType = EncoderType.SIMPLE,
stream_scopes: List[str] = None,
reuse: bool = False
) -> List[tf.Tensor]:
"""
Creates encoding stream for observations.

activation_fn,
num_layers,
f"{_scope_add}main_graph_{i}_encoder{j}", # scope
False, # reuse
reuse=reuse, # reuse
)
visual_encoders.append(encoded_visual)
hidden_visual = tf.concat(visual_encoders, axis=1)

activation_fn,
num_layers,
scope=f"{_scope_add}main_graph_{i}",
reuse=False,
reuse=reuse,
)
if hidden_state is not None and hidden_visual is not None:
final_hidden = tf.concat([hidden_visual, hidden_state], axis=1)
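These changes thread a reuse flag (and an optional per-stream scope prefix) through ModelUtils.create_observation_streams so the same encoder variables can be rebuilt for a second observation stream. A minimal TF 1.x sketch of the pattern, with a hypothetical dense_encoder helper:

import tensorflow as tf  # TF 1.x graph mode, as used by ml-agents here

def dense_encoder(inputs, h_size, scope, reuse=False):
    # With reuse=True this binds to the existing variables in `scope` instead of
    # creating new ones, so two observation streams can share one encoder.
    with tf.variable_scope(scope, reuse=reuse):
        return tf.layers.dense(inputs, h_size, activation=tf.nn.swish, name="hidden_0")

# current observations create the variables; next observations reuse them
# h_cur  = dense_encoder(obs,      128, "encoding/main_graph_0", reuse=False)
# h_next = dense_encoder(next_obs, 128, "encoding/main_graph_0", reuse=True)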

ml-agents/mlagents/trainers/policy/transfer_policy.py (280 changes)


def __init__(
self,
encoded: tf.Tensor,
feature_size: int
feature_size: int,
reuse: bool=False
):
self.mu = tf.layers.dense(
encoded,

kernel_initializer=ModelUtils.scaled_init(0.01),
reuse=tf.AUTO_REUSE,
reuse=reuse,
)
self.log_sigma = tf.layers.dense(

name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
reuse=reuse
)
self.sigma = tf.exp(self.log_sigma)
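GaussianEncoderDistribution gains a reuse flag so the target stream can share the mu/log_std layers. For reference, a self-contained sketch of what such a distribution typically provides (reparameterized sampling plus the kl_standard term used by the optimizer below); the actual class in this branch may differ in details:

import tensorflow as tf  # TF 1.x

class GaussianEncoderDistributionSketch:
    def __init__(self, encoded, feature_size, reuse=False):
        # Two linear heads produce the mean and log-std of the latent distribution;
        # reuse=True binds to existing variables so a second stream shares them.
        self.mu = tf.layers.dense(encoded, feature_size, name="mu", reuse=reuse)
        self.log_sigma = tf.layers.dense(encoded, feature_size, name="log_std", reuse=reuse)
        self.sigma = tf.exp(self.log_sigma)

    def sample(self):
        # Reparameterization trick: mu + sigma * eps, with eps ~ N(0, I)
        eps = tf.random_normal(tf.shape(self.mu))
        return self.mu + self.sigma * eps

    def kl_standard(self):
        # KL(N(mu, sigma) || N(0, I)), summed over latent dims, averaged over the batch
        kl = 0.5 * tf.reduce_sum(
            tf.square(self.mu) + tf.square(self.sigma) - 2.0 * self.log_sigma - 1.0, axis=1
        )
        return tf.reduce_mean(kl)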

def create_tf_graph(self,
encoder_layers = 1,
policy_layers = 1,
policy_units = 128,
inverse_model=False
inverse_model=False,
reuse_encoder=False,
self.reuse_encoder = transfer
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

)
self.next_visual_in: List[tf.Tensor] = []
with tf.variable_scope("encoding"):
self.encoder, self.targ_encoder = self.create_encoders()
with tf.variable_scope("inverse"):
self.create_inverse_model(self.encoder, self.targ_encoder)
with tf.variable_scope("predict"):
self.create_forward_model(self.encoder, self.targ_encoder)
# self.encoder_distribution, self.encoder = self._create_var_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# _, self.targ_encoder = self._create_var_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# self.encoder, self.targ_encoder, self.encoder_distribution, _ = self.create_encoders(var_latent=True, reuse_encoder=reuse_encoder)
# self.encoder = self._create_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# self.encoder, self.targ_encoder = self.create_encoders(reuse_encoder=reuse_encoder)
# if not reuse_encoder:
# self.targ_encoder = tf.stop_gradient(self.targ_encoder)
# self._create_hard_copy()
if var_encoder:
self.encoder_distribution, self.encoder = self._create_var_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type
)
# self.targ_encoder = self._create_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
_, self.targ_encoder = self._create_var_target_encoder(
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
reuse_encoder
)
else:
self.encoder = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type
)
# self._create_hard_copy()
self.targ_encoder = self._create_target_encoder(
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
reuse_encoder
)
if not reuse_encoder:
self.targ_encoder = tf.stop_gradient(self.targ_encoder)
self._create_hard_copy()
with tf.variable_scope("inverse"):
self.create_inverse_model(self.encoder, self.targ_encoder)
with tf.variable_scope("predict"):
self.create_forward_model(self.encoder, self.targ_encoder)
# if var_predict:
# self.predict_distribution, self.predict = self._create_var_world_model(

if self.use_continuous_act:
self._create_cc_actor(
self.encoder,
policy_units,
self.h_size,
policy_layers,
self.tanh_squash,
self.reparameterize,

else:
self._create_dc_actor(self.encoder, policy_units, policy_layers, separate_train)
self._create_dc_actor(self.encoder, self.h_size, policy_layers, separate_train)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"

self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
if not transfer:
if self.inverse_model:
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="inverse"
)
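When reuse_encoder is off, the target encoder lives in its own "target_enc" scope as a stop-gradient copy and is periodically synchronized from the online "encoding" scope via _create_hard_copy / run_hard_copy. A sketch of what such a hard-copy op amounts to in TF 1.x (hypothetical build_hard_copy_op, pairing variables by creation order):

import tensorflow as tf  # TF 1.x

def build_hard_copy_op(source_scope="encoding", target_scope="target_enc"):
    # Assigns the online encoder's values into the target encoder; run every
    # copy_every updates (see the optimizer below).
    src_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, source_scope)
    dst_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, target_scope)
    return tf.group(*[dst.assign(src) for src, dst in zip(src_vars, dst_vars)])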

"""
with self.graph.as_default():
with tf.variable_scope("predict"):
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
)
# self.current_action = tf.placeholder(
# shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
# )
hidden_stream = ModelUtils.create_vector_observation_encoder(
tf.concat([encoder, self.current_action], axis=1),
h_size,

)
with tf.variable_scope("latent"):
if predict_return:
predict_distribution = GaussianEncoderDistribution(
predict_distribution = GaussianEncoderDistribution(
# separate prediction of return
feature_size
feature_size + 1
)
predict = predict_distribution.sample()
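The forward ("predict") model consumes the current latent and action and, when predict_return is on, predicts the discounted return alongside the next latent. A hedged sketch of that head (hypothetical forward_model_sketch; the actual code builds it from create_vector_observation_encoder and GaussianEncoderDistribution):

import tensorflow as tf  # TF 1.x

def forward_model_sketch(latent, action, h_size, feature_size, predict_return=False):
    # Hidden layer over (latent, action); the output is the predicted next latent,
    # with one extra unit for the predicted return when predict_return is set.
    hidden = tf.layers.dense(
        tf.concat([latent, action], axis=1), h_size, activation=tf.nn.swish
    )
    out_size = feature_size + 1 if predict_return else feature_size
    return tf.layers.dense(hidden, out_size, name="next_latent")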

feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
reuse_encoder: bool
if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)

else:
self.processed_vector_next = self.vector_next
with tf.variable_scope("target_enc"):
with tf.variable_scope(next_encoder_scope):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,

vis_encode_type,
reuse=reuse_encoder
name="latent"
name="latent",
reuse=reuse_encoder
return tf.stop_gradient(latent_targ)
return latent_targ
# return tf.stop_gradient(latent_targ)
def _create_encoder(
self,

feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
reuse_encoder: bool
if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)

else:
self.processed_vector_next = self.vector_next
with tf.variable_scope("target_enc"):
with tf.variable_scope(next_encoder_scope):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,

latent_targ = latent_targ_distribution.sample()
return latent_targ_distribution, tf.stop_gradient(latent_targ)
return latent_targ_distribution, latent_targ
def _create_var_encoder(
self,

:param steps: The number of steps the model was trained for
:return:
"""
self.get_policy_weights()
with self.graph.as_default():
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
self.saver.save(self.sess, last_checkpoint)

encoding_checkpoint = os.path.join(self.model_path, f"encoding.ckpt")
encoding_saver.save(self.sess, encoding_checkpoint)
# latent_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
# latent_saver = tf.train.Saver(latent_vars)
# latent_checkpoint = os.path.join(self.model_path, f"latent.ckpt")
# latent_saver.save(self.sess, latent_checkpoint)
latent_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
latent_saver = tf.train.Saver(latent_vars)
latent_checkpoint = os.path.join(self.model_path, f"latent.ckpt")
latent_saver.save(self.sess, latent_checkpoint)
predict_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
predict_saver = tf.train.Saver(predict_vars)
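Besides the full model checkpoint, the "encoding", "encoding/latent", and "predict" scopes are now saved into their own checkpoints so they can be restored in isolation when transferring. The pattern is a scope-restricted tf.train.Saver; a minimal sketch with a hypothetical save_scope helper:

import os
import tensorflow as tf  # TF 1.x

def save_scope(sess, scope, model_path):
    # Saves only the variables under `scope` (e.g. "encoding", "encoding/latent",
    # "predict") so they can be restored independently of the rest of the graph.
    scope_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
    saver = tf.train.Saver(scope_vars)
    saver.save(sess, os.path.join(model_path, f"{scope}.ckpt"))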

def get_encoder_weights(self):
with self.graph.as_default():
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding/main_graph_0/hidden_0/bias:0")
targ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/main_graph_0/hidden_0/bias:0")
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding/latent/bias:0")
targ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/latent/bias:0")
def get_policy_weights(self):
with self.graph.as_default():
pol = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "policy/mu/bias:0")
print("policy:", self.sess.run(pol))
def create_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
def create_encoders(self, var_latent: bool=False, reuse_encoder: bool=False) -> Tuple[tf.Tensor, tf.Tensor]:
if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
if self.vis_obs_size > 0:
self.next_visual_in = []
visual_encoders = []

next_visual_input = ModelUtils.create_visual_input(
self.brain.camera_resolutions[i],
name="curiosity_next_visual_observation_" + str(i),
name="next_visual_observation_" + str(i),
encoded_visual = ModelUtils.create_visual_observation_encoder(
self.visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_stream_{}_visual_obs_encoder".format(i),
False,
)
with tf.variable_scope("encoding"):
encoded_visual = ModelUtils.create_visual_observation_encoder(
self.visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"stream_{}_visual_obs_encoder".format(i),
False,
)
with tf.variable_scope(next_encoder_scope):
encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"stream_{}_visual_obs_encoder".format(i),
reuse_encoder
)
encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_stream_{}_visual_obs_encoder".format(i),
True,
)
visual_encoders.append(encoded_visual)
next_visual_encoders.append(encoded_next_visual)

self.next_vector_in = tf.placeholder(
shape=[None, self.vec_obs_size],
dtype=tf.float32,
name="curiosity_next_vector_observation",
name="next_vector_observation",
encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
self.vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_vector_obs_encoder",
False,
)
encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
self.next_vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_vector_obs_encoder",
True,
)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.next_vector_in,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.next_vector_in
with tf.variable_scope("encoding"):
encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
self.vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"vector_obs_encoder",
False,
)
with tf.variable_scope(next_encoder_scope):
encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
self.processed_vector_next,
self.h_size,
ModelUtils.swish,
self.num_layers,
"vector_obs_encoder",
reuse_encoder
)
encoded_state_list.append(encoded_vector_obs)
encoded_next_state_list.append(encoded_next_vector_obs)

encoded_state = tf.layers.dense(
if var_latent:
with tf.variable_scope("encoding/latent"):
encoded_state_dist = GaussianEncoderDistribution(
name="latent"
encoded_next_state = tf.layers.dense(
encoded_state = encoded_state_dist.sample()
with tf.variable_scope(next_encoder_scope+"/latent"):
encoded_next_state_dist = GaussianEncoderDistribution(
name="latent",
reuse=True
reuse=reuse_encoder
encoded_next_state = encoded_next_state_dist.sample()
return encoded_state, encoded_next_state, encoded_state_dist, encoded_next_state_dist
else:
with tf.variable_scope("encoding"):
encoded_state = tf.layers.dense(
encoded_state,
self.feature_size,
name="latent"
)
with tf.variable_scope(next_encoder_scope):
encoded_next_state = tf.layers.dense(
encoded_next_state,
self.feature_size,
name="latent",
reuse=reuse_encoder
)
return encoded_state, encoded_next_state
return encoded_state, encoded_next_state
def create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor

squared_difference = 0.5 * tf.reduce_sum(
tf.squared_difference(self.predict, encoded_next_state), axis=1
)
self.intrinsic_reward = squared_difference
# self.intrinsic_reward = squared_difference
self.forward_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
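The forward loss above is the masked mean of the squared difference between the predicted and target next latents. Its counterpart, the inverse model referenced throughout this diff, predicts the action that connects two consecutive latents; a hedged sketch for the continuous-action case (hypothetical inverse_loss_sketch):

import tensorflow as tf  # TF 1.x

def inverse_loss_sketch(encoded_state, encoded_next_state, actions, h_size, act_size):
    # Predict the (continuous) action linking two consecutive latents; the MSE to
    # the action actually taken is the inverse-model loss added to the model loss.
    combined = tf.concat([encoded_state, encoded_next_state], axis=1)
    hidden = tf.layers.dense(combined, h_size, activation=tf.nn.swish)
    pred_action = tf.layers.dense(hidden, act_size, name="pred_action")
    return tf.reduce_mean(
        tf.reduce_sum(tf.squared_difference(pred_action, actions), axis=1)
    )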

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (181 changes)


from mlagents.trainers.policy.transfer_policy import TransferPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.settings import TrainerSettings, PPOSettings, PPOTransferSettings
import tf_slim as slim
class PPOTransferOptimizer(TFOptimizer):

:param policy: A TFPolicy object that will be updated by this PPO Optimizer.
:param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
"""
self.separate_value_train = False
self.separate_policy_train = False
self.use_var_encoder = False
self.use_var_predict = False
self.with_prior = False
self.use_inverse_model = True
self.predict_return = False
hyperparameters: PPOTransferSettings = cast(
PPOTransferSettings, trainer_params.hyperparameters
)
self.use_alter = False
self.in_batch_alter = False
self.in_epoch_alter = False
self.separate_value_train = hyperparameters.separate_value_train
self.separate_policy_train = hyperparameters.separate_policy_train
self.use_var_encoder = hyperparameters.use_var_encoder
self.use_var_predict = hyperparameters.use_var_predict
self.with_prior = hyperparameters.with_prior
self.use_inverse_model = hyperparameters.use_inverse_model
self.predict_return = hyperparameters.predict_return
self.reuse_encoder = hyperparameters.reuse_encoder
self.use_alter = hyperparameters.use_alter
self.in_batch_alter = hyperparameters.in_batch_alter
self.in_epoch_alter = hyperparameters.in_epoch_alter
self.num_updates = 0
self.alter_every = 400
self.copy_every = 1
self.train_type = "all"
self.train_type = hyperparameters.train_type
self.use_transfer = False
self.smart_transfer = False
self.conv_thres = 1e-6
self.old_loss = np.inf
self.update_mode = "model"
self.transfer_path = "results/BallSingle_nosep_cmodel_small/3DBall"
self.transfer_type = "dynamics"
self.use_transfer = hyperparameters.use_transfer
self.transfer_path = hyperparameters.transfer_path #"results/BallSingle_nosep_cmodel_small/3DBall"
self.smart_transfer = hyperparameters.smart_transfer
self.conv_thres = hyperparameters.conv_thres
self.transfer_type = hyperparameters.transfer_type
policy.create_tf_graph(1, 1, 128, self.use_transfer, self.separate_policy_train,
self.use_var_encoder, self.use_var_predict, self.predict_return, self.use_inverse_model)
policy.create_tf_graph(hyperparameters.encoder_layers, hyperparameters.policy_layers,
self.use_transfer, self.separate_policy_train, self.use_var_encoder, self.use_var_predict,
self.predict_return, self.use_inverse_model, self.reuse_encoder)
hyperparameters: PPOSettings = cast(
PPOSettings, trainer_params.hyperparameters
)
lr = float(hyperparameters.learning_rate)
self._schedule = hyperparameters.learning_rate_schedule

num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.burn_in_ratio = 0.0
self.num_updates = 0
self.alter_every = 400
self.copy_every = 10
self.old_loss = np.inf
self.update_mode = "model"
self.stream_names = list(self.reward_signals.keys())

if self.use_transfer:
self.policy.load_graph_partial(self.transfer_path, self.transfer_type)
self.policy.get_encoder_weights()
self.policy.get_policy_weights()
# saver = tf.train.Saver()
# model_checkpoint = os.path.join(self.transfer_path, f"model-4000544.ckpt")
# saver.restore(self.sess, model_checkpoint)

self.dis_returns = tf.placeholder(
shape=[None], dtype=tf.float32, name="dis_returns"
)
# target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
# if self.predict_return:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
# else:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
# if self.with_prior:
# if self.use_var_encoder:
# self.model_loss += encoder_distribution.kl_standard()
# if self.use_var_predict:
# self.model_loss += self.policy.predict_distribution.kl_standard()
target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
if self.predict_return:
self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
else:
self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
if self.with_prior:
if self.use_var_encoder:
self.model_loss += encoder_distribution.kl_standard()
if self.use_var_predict:
self.model_loss += self.policy.predict_distribution.kl_standard()
# if self.use_inverse_model:
# self.model_loss += self.policy.inverse_loss
self.model_loss = 0.2 * self.policy.forward_loss + 0.8 * self.policy.inverse_loss
if self.use_inverse_model:
self.model_loss += self.policy.inverse_loss
# self.model_loss = 0.2 * self.policy.forward_loss + 0.8 * self.policy.inverse_loss
self.loss = (
self.policy_loss
+ self.model_loss
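The previously commented-out block becomes active: the model loss is the squared error between the forward prediction and the target latent (extended with the discounted return when predict_return is on), plus optional KL priors on the variational encoder/predictor and, when enabled, the inverse-model loss; the total loss then adds this to the PPO policy loss. A condensed sketch of that composition (names as in the diff, structure hedged):

import tensorflow as tf  # TF 1.x

def build_model_loss(predict, targ_encoder, dis_returns, policy, predict_return,
                     with_prior, use_var_encoder, use_var_predict,
                     use_inverse_model, encoder_distribution=None):
    # Target is the next latent, optionally extended with the discounted return.
    target = (
        tf.concat([targ_encoder, tf.expand_dims(dis_returns, -1)], axis=1)
        if predict_return
        else targ_encoder
    )
    model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
    if with_prior:
        if use_var_encoder:
            model_loss += encoder_distribution.kl_standard()
        if use_var_predict:
            model_loss += policy.predict_distribution.kl_standard()
    if use_inverse_model:
        model_loss += policy.inverse_loss
    return model_loss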

train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += self.policy.get_trainable_variables
print("trainable", train_vars)
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")

# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
elif self.transfer_type == "observation":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
train_vars += self.policy.get_trainable_variables()
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
print("trainable", train_vars)
self.tf_optimizer = self.create_optimizer_op(self.learning_rate)

def _init_alter_update(self):
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
if self.use_alter:
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
policy_train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.loss, var_list=policy_train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.loss, var_list=policy_train_vars)
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.loss, var_list=model_train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.loss, var_list=model_train_vars)
else:
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.ppo_update_dict.update(
{

stats_needed.update(reward_signal.stats_name_to_update_name)
if self.use_alter:
if self.num_updates / self.alter_every == 0:
update_vals = self._execute_model(feed_dict, self.update_dict)
if self.num_updates % self.alter_every == 0:
print("start update all", self.num_updates)
elif (self.num_updates / self.alter_every) % 2 == 1:
# if self.num_updates / self.alter_every == 0:
# update_vals = self._execute_model(feed_dict, self.update_dict)
# if self.num_updates % self.alter_every == 0:
# print("start update all", self.num_updates)
if (self.num_updates / self.alter_every) % 2 == 0:
stats_needed = {
"Losses/Model Loss": "model_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
stats_needed = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
if self.num_updates % self.alter_every == 0:
print("start update policy", self.num_updates)

update_vals = self._execute_model(feed_dict, self.update_dict)
# update target encoder
# if self.num_updates % self.copy_every == 0:
# self.policy.run_hard_copy()
if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
self.policy.run_hard_copy()
# print("copy")
# self.policy.get_encoder_weights()

update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
# update target encoder
# self.policy.run_hard_copy()
if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
self.policy.run_hard_copy()
self.num_updates += 1
return update_stats
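With use_alter, updates alternate in blocks of alter_every steps between the model objective and the PPO policy/value objective, and whenever the encoder is not reused the target encoder is hard-copied from the online one every copy_every updates. A simplified sketch of the intended schedule (integer division used for clarity; hypothetical pick_update_phase helper):

def pick_update_phase(num_updates: int, alter_every: int = 400) -> str:
    # Even-numbered blocks of `alter_every` updates run the model objective,
    # odd-numbered blocks run the PPO policy/value objective.
    return "model" if (num_updates // alter_every) % 2 == 0 else "policy"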
def _construct_feed_dict(

self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
# self.policy.processed_vector_next: mini_batch["next_vector_in"],
self.policy.next_vector_in: mini_batch["next_vector_in"],
self.policy.processed_vector_next: mini_batch["next_vector_in"],
# self.policy.next_vector_in: mini_batch["next_vector_in"],
self.policy.current_action: mini_batch["actions"],
self.dis_returns: mini_batch["discounted_returns"]
}

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (91 changes)


from mlagents.trainers.ppo_transfer.optimizer import PPOTransferOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings
from mlagents.trainers.settings import TrainerSettings, PPOSettings, PPOTransferSettings
BUFFER_TRUNCATE_PERCENT = 0.6
logger = get_logger(__name__)

super(PPOTransferTrainer, self).__init__(
brain_name, trainer_settings, training, artifact_path, reward_buff_cap
)
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
self.hyperparameters: PPOTransferSettings = cast(
PPOTransferSettings, self.trainer_settings.hyperparameters
self.use_iealter = False
self.use_iealter = self.hyperparameters.in_epoch_alter
self.use_op_buffer = self.hyperparameters.use_op_buffer
self.conv_thres = self.hyperparameters.conv_thres
self.num_check = 0
self.train_model = True
self.old_loss = np.inf
print("The current algorithm is PPO Transfer")
def _process_trajectory(self, trajectory: Trajectory) -> None:

self.update_buffer, training_length=self.policy.sequence_length
)
# the off-policy buffer
if self.use_iealter:
if self.use_op_buffer:
agent_buffer_trajectory.resequence_and_append(
self.off_policy_buffer, training_length=self.policy.sequence_length
)

Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
if self.use_iealter:
size_of_buffer = self.off_policy_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
else:
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
# if self.train_model and self.use_op_buffer:
# size_of_buffer = self.off_policy_buffer.num_experiences
# self.num_check += 1
# if self.num_check % 50 == 0 and size_of_buffer >= self.hyperparameters.buffer_size:
# return True
# else:
# return False
# else:
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy(self):
"""

if self.use_iealter:
if self.train_model and self.use_op_buffer:
if self.update_buffer.num_experiences < self.hyperparameters.buffer_size:
return True
# if self.update_buffer.num_experiences < self.hyperparameters.buffer_size:
# return True
buffer_length = self.update_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()

num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_iealter:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
else:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
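With in_epoch_alter, each training epoch makes two passes over the shuffled buffer: one applying only the model objective (update_part(..., "model")) and one applying only the PPO objective (update_part(..., "policy")). A condensed sketch of that loop, under the assumption that update_part behaves as shown in the optimizer diff:

from collections import defaultdict

def update_in_epoch_alter(optimizer, update_buffer, policy, num_epoch, batch_size, n_sequences):
    # One epoch = a full pass of model-only minibatch updates followed by a full
    # pass of policy-only updates, mirroring the in_epoch_alter branch above.
    batch_update_stats = defaultdict(list)
    for _ in range(num_epoch):
        for phase in ("model", "policy"):
            update_buffer.shuffle(sequence_length=policy.sequence_length)
            buffer_length = update_buffer.num_experiences
            for i in range(0, (buffer_length // batch_size) * batch_size, batch_size):
                stats = optimizer.update_part(
                    update_buffer.make_mini_batch(i, i + batch_size), n_sequences, phase
                )
                for name, value in stats.items():
                    batch_update_stats[name].append(value)
    return batch_update_stats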

n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.off_policy_buffer["advantages"].get_batch()
self.off_policy_buffer["advantages"].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)

batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:
if abs(self.old_loss - np.mean(stat_list)) < 1e-3:
self.train_model = False
else:
self.old_loss = np.mean(stat_list)
print(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()

# self.off_policy_buffer.reset_agent()
if self.off_policy_buffer.num_experiences > self.hyperparameters.buffer_size:
if self.off_policy_buffer.num_experiences > 10 * self.hyperparameters.buffer_size:
print("truncate")
int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
int(5 * self.hyperparameters.buffer_size)
)
return True
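The off-policy buffer used for model learning is now allowed to grow well past buffer_size and is only truncated once it exceeds ten times that limit, keeping at most 5 * buffer_size experiences. A sketch of that rule, assuming the buffer's truncate method behaves as in the base PPO trainer:

def maybe_truncate_off_policy_buffer(off_policy_buffer, buffer_size: int) -> None:
    # New rule in this commit: only truncate once the off-policy buffer exceeds
    # 10x buffer_size, and keep at most 5x buffer_size experiences afterwards.
    if off_policy_buffer.num_experiences > 10 * buffer_size:
        off_policy_buffer.truncate(int(5 * buffer_size))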

ml-agents/mlagents/trainers/settings.py (38 changes)


def check_and_structure(key: str, value: Any, class_type: type) -> Any:
attr_fields_dict = attr.fields_dict(class_type)
print(attr_fields_dict)
if key not in attr_fields_dict:
raise TrainerConfigError(
f"The option {key} was specified in your YAML file for {class_type.__name__}, but is invalid."

lambd: float = 0.95
num_epoch: int = 3
learning_rate_schedule: ScheduleType = ScheduleType.LINEAR
@attr.s(auto_attribs=True)
class PPOTransferSettings(HyperparamSettings):
beta: float = 5.0e-3
epsilon: float = 0.2
lambd: float = 0.95
num_epoch: int = 3
learning_rate_schedule: ScheduleType = ScheduleType.LINEAR
separate_value_train: bool = False
separate_policy_train: bool = False
use_var_encoder: bool = False
use_var_predict: bool = False
with_prior: bool = False
use_inverse_model: bool = False
predict_return: bool = False
reuse_encoder: bool = False
use_alter: bool = False
in_batch_alter: bool = False
in_epoch_alter: bool = False
use_op_buffer: bool = False
train_type: str = "all"
# Transfer
use_transfer: bool = False
smart_transfer: bool = False
conv_thres: float = 1e-3
transfer_path: str = ""
transfer_type: str = "dynamics"
# Network
encoder_layers: int = 1
policy_layers: int = 1
forward_layers: int = 1
inverse_layers: int = 1
@attr.s(auto_attribs=True)

def to_settings(self) -> type:
_mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings,
TrainerType.PPO_Transfer: PPOSettings}
TrainerType.PPO_Transfer: PPOTransferSettings}
return _mapping[self]
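With TrainerType.PPO_Transfer now mapped to PPOTransferSettings, the hyperparameters block of a ppo_transfer trainer config structures into this class. A hypothetical construction showing some of the new fields (values are illustrative; the transfer_path is a placeholder):

# Hypothetical usage; every field shown is defined on PPOTransferSettings above.
settings = PPOTransferSettings(
    num_epoch=3,
    conv_thres=1e-3,                              # convergence threshold for the model loss
    use_transfer=True,
    transfer_path="results/<source_run>/3DBall",  # placeholder path to a previous run
    transfer_type="dynamics",
    encoder_layers=1,
    policy_layers=1,
    use_var_predict=True,
)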
