working

/develop/bisim-review
Andrew Cohen, 4 years ago
Current commit
d0133066
5 files changed, 593 insertions and 494 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (446 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (258 changes)
  3. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (176 changes)
  4. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (153 changes)
  5. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (54 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (446 changes)


GaussianDistribution,
MultiCategoricalDistribution,
)
def __init__(
self,
encoded: tf.Tensor,
feature_size: int,
reuse: bool=False
):
def __init__(self, encoded: tf.Tensor, feature_size: int, reuse: bool = False):
self.mu = tf.layers.dense(
encoded,
feature_size,

activation=None,
name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
reuse=reuse
reuse=reuse,
kl = 0.5 * tf.reduce_sum(tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1)
kl = 0.5 * tf.reduce_sum(
tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1
)
return tf.squared_difference(self.mu, another.mu) + tf.squared_difference(self.sigma, another.sigma)
return tf.squared_difference(self.mu, another.mu) + tf.squared_difference(
self.sigma, another.sigma
)
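
The two formulas above are the encoder distribution's KL term against a standard normal and a squared distance between two diagonal Gaussians. A minimal NumPy sketch of the same math (assuming `mu`, `log_sigma`, and `sigma = exp(log_sigma)` as in the class this diff touches):

import numpy as np

def kl_to_standard_normal(mu, log_sigma):
    # KL(N(mu, sigma^2) || N(0, I)) summed over the feature axis, matching
    # 0.5 * sum(mu^2 + sigma^2 - 2*log_sigma - 1) in the diff above.
    sigma = np.exp(log_sigma)
    return 0.5 * np.sum(np.square(mu) + np.square(sigma) - 2 * log_sigma - 1, axis=1)

def distribution_distance(mu_a, sigma_a, mu_b, sigma_b):
    # Per-dimension squared differences of means and standard deviations; summing
    # these gives the squared 2-Wasserstein distance between diagonal Gaussians,
    # mirroring w_distance above.
    return np.square(mu_a - mu_b) + np.square(sigma_a - sigma_b)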
class TransferPolicy(TFPolicy):

self.encoder_distribution = None
self.targ_encoder = None
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20

"""
return self.trainable_variables
def create_tf_graph(self,
encoder_layers = 1,
policy_layers = 1,
forward_layers = 1,
inverse_layers = 1,
feature_size = 16,
transfer=False,
separate_train=False,
def create_tf_graph(
self,
encoder_layers=1,
policy_layers=1,
forward_layers=1,
inverse_layers=1,
feature_size=16,
transfer=False,
separate_train=False,
var_predict=False,
predict_return=False,
var_predict=True,
predict_return=True,
reuse_encoder=False,
use_bisim=False
reuse_encoder=True,
use_bisim=True,
) -> None:
"""
Builds the tensorflow graph needed for this policy.

return
self.create_input_placeholders()
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
shape=[None, sum(self.act_size)],
dtype=tf.float32,
name="current_action",
)
self.current_reward = tf.placeholder(
shape=[None], dtype=tf.float32, name="current_reward"

if var_encoder:
self.encoder_distribution, self.encoder = self._create_var_encoder(
self.visual_in,

encoder_layers,
self.vis_encode_type
self.vis_encode_type,
)
_, self.targ_encoder = self._create_var_target_encoder(

self.vis_encode_type,
reuse_encoder
reuse_encoder,
)
else:
self.encoder = self._create_encoder(

self.feature_size,
encoder_layers,
self.vis_encode_type
self.vis_encode_type,
)
self.targ_encoder = self._create_target_encoder(

self.vis_encode_type,
reuse_encoder
reuse_encoder,
self.create_inverse_model(self.encoder, self.targ_encoder, inverse_layers)
self.create_inverse_model(
self.encoder, self.targ_encoder, inverse_layers
)
self.create_forward_model(self.encoder, self.targ_encoder, forward_layers,
var_predict=var_predict)
self.create_forward_model(
self.encoder,
self.targ_encoder,
forward_layers,
var_predict=var_predict,
)
self.create_reward_model(self.encoder, self.targ_encoder, forward_layers)
self.create_reward_model(
self.encoder, self.targ_encoder, forward_layers
)
self.create_bisim_model(self.h_size, self.feature_size, encoder_layers,
self.vis_encode_type, forward_layers, var_predict, predict_return)
self.create_bisim_model(
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
forward_layers,
var_predict,
predict_return,
)
if self.use_continuous_act:
self._create_cc_actor(

self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
separate_train
separate_train,
self._create_dc_actor(self.encoder, self.h_size, policy_layers, separate_train)
self._create_dc_actor(
self.encoder, self.h_size, policy_layers, separate_train
)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"

self._initialize_graph()
# slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True)
def load_graph_partial(self, path: str, transfer_type="dynamics", load_model=True, load_policy=True,
load_value=True):
load_nets = {"dynamics": [],
"observation": ["encoding", "inverse"]}
def load_graph_partial(
self,
path: str,
transfer_type="dynamics",
load_model=True,
load_policy=True,
load_value=True,
):
load_nets = {"dynamics": [], "observation": ["encoding", "inverse"]}
if load_model:
load_nets["dynamics"].append("predict")
if self.predict_return:

load_nets["dynamics"].append("value")
if self.inverse_model:
load_nets["dynamics"].append("inverse")
variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, net)
variables_to_restore = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, net
)
if transfer_type == "observation":
self.run_hard_copy()

feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
predict_return: bool=False
predict_return: bool = False,
) -> tf.Tensor:
""""
Builds the world model for state prediction

ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
reuse=False,
hidden_stream,
feature_size+1,
name="next_state"
hidden_stream, feature_size + 1, name="next_state"
hidden_stream,
feature_size,
name="next_state"
hidden_stream, feature_size, name="next_state"
@timed
def evaluate(

feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
reuse_encoder: bool
reuse_encoder: bool,
) -> tf.Tensor:
if reuse_encoder:
next_encoder_scope = "encoding"

h_size,
num_layers,
vis_encode_type,
reuse=reuse_encoder
reuse=reuse_encoder,
hidden_stream_targ,
feature_size,
name="latent",
reuse=reuse_encoder,
# activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
hidden_stream_targ,
feature_size,
name="latent",
reuse=reuse_encoder,
# activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
def _create_encoder(
self,
visual_in: List[tf.Tensor],

"""
with tf.variable_scope("encoding"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in,
vector_in,
1,
h_size,
num_layers,
vis_encode_type,
visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
hidden_stream,
feature_size,
name="latent",
# activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
hidden_stream,
feature_size,
name="latent",
# activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
def _create_var_target_encoder(
self,
h_size: int,

reuse_encoder: bool
reuse_encoder: bool,
) -> tf.Tensor:
if reuse_encoder:
next_encoder_scope = "encoding"

h_size,
num_layers,
vis_encode_type,
reuse=reuse_encoder
reuse=reuse_encoder,
hidden_stream_targ,
feature_size,
reuse=reuse_encoder
hidden_stream_targ, feature_size, reuse=reuse_encoder
)
latent_targ = latent_targ_distribution.sample()

h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates a variational encoder for visual and vector observations.

with tf.variable_scope("encoding"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in,
vector_in,
1,
h_size,
num_layers,
vis_encode_type,
visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
hidden_stream,
feature_size
hidden_stream, feature_size
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_enc')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoding')
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
with tf.variable_scope('hard_replacement'):
self.target_replace_op = [tf.assign(t, 0.9*t + 0.1*e) for t, e in zip(t_params, e_params)]
with tf.variable_scope("hard_replacement"):
self.target_replace_op = [
tf.assign(t, 0.9 * t + 0.1 * e) for t, e in zip(t_params, e_params)
]
def run_hard_copy(self):
self.sess.run(self.target_replace_op)
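
Despite the `hard_replacement` scope name, the op built above is a soft (Polyak-style) update, target <- 0.9*target + 0.1*encoder. A minimal TF1-style sketch of that pattern (the scope names `encoding` and `target_enc` come from the diff; the import path is an assumption):

import tensorflow.compat.v1 as tf  # assumption: TF1 graph-mode API as used by ml-agents

def build_soft_copy_op(tau=0.1):
    # target <- (1 - tau) * target + tau * encoder, variable by variable.
    t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
    e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
    return [tf.assign(t, (1.0 - tau) * t + tau * e) for t, e in zip(t_params, e_params)]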

"""
with tf.variable_scope("inverse"):
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, self.h_size, activation=ModelUtils.swish)
hidden = tf.layers.dense(
combined_input, self.h_size, activation=ModelUtils.swish
)
pred_action = tf.layers.dense(
hidden, self.act_size[0], activation=None
)
pred_action = tf.layers.dense(hidden, self.act_size[0], activation=None)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.current_action), axis=1
)

self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
)
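
For reference, a self-contained sketch of the continuous-action inverse model shown above: the concatenated (state, next state) encoding passes through one hidden layer and is trained to recover the action taken. ModelUtils.swish is replaced by tf.nn.swish so the snippet stands alone, and the tf.dynamic_partition masking from the diff is dropped for brevity:

import tensorflow.compat.v1 as tf  # assumption: TF1 graph-mode API

def inverse_model_loss(encoded_state, encoded_next_state, current_action, h_size, act_size):
    # Predict the continuous action from (s, s') and penalize the squared error
    # against the action that was actually taken.
    combined = tf.concat([encoded_state, encoded_next_state], axis=1)
    hidden = tf.layers.dense(combined, h_size, activation=tf.nn.swish)
    pred_action = tf.layers.dense(hidden, act_size, activation=None)
    return tf.reduce_mean(
        tf.reduce_sum(tf.squared_difference(pred_action, current_action), axis=1)
    )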
def _create_cc_actor(
self,
encoded: tf.Tensor,

reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
separate_train: bool = False
separate_train: bool = False,
) -> None:
"""
Creates Continuous control actor-critic model.

self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(
self,
encoded: tf.Tensor,
h_size: int,
num_layers: int,
separate_train: bool = False
self,
encoded: tf.Tensor,
h_size: int,
num_layers: int,
separate_train: bool = False,
) -> None:
"""
Creates Discrete control actor-critic model.

policy_checkpoint = os.path.join(self.model_path, f"policy.ckpt")
policy_saver.save(self.sess, policy_checkpoint)
encoding_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
encoding_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
latent_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
latent_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent"
)
predict_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
predict_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "predict"
)
predict_saver = tf.train.Saver(predict_vars)
predict_checkpoint = os.path.join(self.model_path, f"predict.ckpt")
predict_saver.save(self.sess, predict_checkpoint)

value_saver.save(self.sess, value_checkpoint)
if self.inverse_model:
inverse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse")
inverse_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "inverse"
)
reward_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
reward_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "reward"
)
def create_target_normalizer(self, vector_obs: tf.Tensor, prefix="vn") -> NormalizerTensors:
def create_target_normalizer(
self, vector_obs: tf.Tensor, prefix="vn"
) -> NormalizerTensors:
prefix+"_normalization_steps",
prefix + "_normalization_steps",
[],
trainable=False,
dtype=tf.int32,

prefix+"vn_running_mean",
prefix + "vn_running_mean",
[vec_obs_size],
trainable=False,
dtype=tf.float32,

prefix+"vn_running_variance",
prefix + "vn_running_variance",
[vec_obs_size],
trainable=False,
dtype=tf.float32,

return NormalizerTensors(
update_normalization, steps, running_mean, running_variance
)
def update_normalization(self, vector_obs: np.ndarray, vector_obs_next: np.ndarray, vector_obs_bisim: np.ndarray) -> None:
def update_normalization(
self,
vector_obs: np.ndarray,
vector_obs_next: np.ndarray,
vector_obs_bisim: np.ndarray,
) -> None:
"""
If this policy normalizes vector observations, this will update the norm values in the graph.
:param vector_obs: The vector observations to add to the running estimate of the distribution.

self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
)
self.sess.run(
self.vn_update_normalization_op, feed_dict={self.vector_next: vector_obs_next}
self.vn_update_normalization_op,
feed_dict={self.vector_next: vector_obs_next},
self.bi_update_normalization_op, feed_dict={self.vector_bisim: vector_obs_bisim}
self.bi_update_normalization_op,
feed_dict={self.vector_bisim: vector_obs_bisim},
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding/latent/bias:0")
targ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/latent/bias:0")
enc = tf.get_collection(
tf.GraphKeys.GLOBAL_VARIABLES, "encoding/latent/bias:0"
)
targ = tf.get_collection(
tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/latent/bias:0"
)
print("encoding:", self.sess.run(enc))
print("target:", self.sess.run(targ))

rew = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "reward")
print("reward:", self.sess.run(rew))
def create_encoders(self, var_latent: bool=False, reuse_encoder: bool=False) -> Tuple[tf.Tensor, tf.Tensor]:
def create_encoders(
self, var_latent: bool = False, reuse_encoder: bool = False
) -> Tuple[tf.Tensor, tf.Tensor]:
encoded_state_list = []
encoded_next_state_list = []
if reuse_encoder:

"stream_{}_visual_obs_encoder".format(i),
False,
)
with tf.variable_scope(next_encoder_scope):
encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],

"stream_{}_visual_obs_encoder".format(i),
reuse_encoder
reuse_encoder,
)
visual_encoders.append(encoded_visual)

ModelUtils.swish,
self.num_layers,
"vector_obs_encoder",
reuse_encoder
reuse_encoder,
)
encoded_state_list.append(encoded_vector_obs)
encoded_next_state_list.append(encoded_next_vector_obs)

if var_latent:
with tf.variable_scope("encoding/latent"):
encoded_state_dist = GaussianEncoderDistribution(
encoded_state,
self.feature_size,
encoded_state, self.feature_size
with tf.variable_scope(next_encoder_scope+"/latent"):
with tf.variable_scope(next_encoder_scope + "/latent"):
encoded_next_state,
self.feature_size,
reuse=reuse_encoder
encoded_next_state, self.feature_size, reuse=reuse_encoder
return encoded_state, encoded_next_state, encoded_state_dist, encoded_next_state_dist
return (
encoded_state,
encoded_next_state,
encoded_state_dist,
encoded_next_state_dist,
)
encoded_state,
self.feature_size,
name="latent"
)
encoded_state, self.feature_size, name="latent"
)
encoded_next_state,
self.feature_size,
name="latent",
reuse=reuse_encoder
)
encoded_next_state,
self.feature_size,
name="latent",
reuse=reuse_encoder,
)
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, inverse_layers: int
self,
encoded_state: tf.Tensor,
encoded_next_state: tf.Tensor,
inverse_layers: int,
) -> None:
"""
Creates inverse model TensorFlow ops for Curiosity module.

combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
# hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
hidden = combined_input
for i in range(inverse_layers-1):
for i in range(inverse_layers - 1):
hidden = tf.layers.dense(
hidden,
self.h_size,

pred_action = tf.concat(
[
tf.layers.dense(
hidden, self.act_size[i], activation=tf.nn.softmax, name="pred_action"
hidden,
self.act_size[i],
activation=tf.nn.softmax,
name="pred_action",
)
for i in range(len(self.act_size))
],

)
def create_forward_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, forward_layers: int,
var_predict: bool=False, separate_train: bool=False
self,
encoded_state: tf.Tensor,
encoded_next_state: tf.Tensor,
forward_layers: int,
var_predict: bool = False,
separate_train: bool = False,
) -> None:
"""
Creates forward model TensorFlow ops for Curiosity module.

"""
combined_input = tf.concat(
[encoded_state, self.current_action], axis=1
)
combined_input = tf.concat([encoded_state, self.current_action], axis=1)
hidden = combined_input
if separate_train:
hidden = tf.stop_gradient(hidden)

if var_predict:
self.predict_distribution = GaussianEncoderDistribution(
hidden,
self.feature_size
hidden, self.feature_size
)
self.predict = self.predict_distribution.sample()
else:

)
squared_difference = 0.5 * tf.reduce_sum(
tf.squared_difference(self.predict, encoded_next_state), axis=1
tf.squared_difference(self.predict, tf.stop_gradient(encoded_next_state)),
axis=1,
self.forward_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
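
One functional change here: the forward-model target is wrapped in tf.stop_gradient, so this loss no longer pushes gradients into the target encoding, and the masked mean becomes a plain reduce_mean. A minimal sketch of the resulting loss:

import tensorflow.compat.v1 as tf  # assumption: TF1 graph-mode API

def forward_model_loss(predicted_next, encoded_next_state):
    # Squared error against the gradient-stopped next-state encoding.
    per_sample = 0.5 * tf.reduce_sum(
        tf.squared_difference(predicted_next, tf.stop_gradient(encoded_next_state)),
        axis=1,
    )
    return tf.reduce_mean(per_sample)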
def create_reward_model(self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor,
forward_layers: int, separate_train: bool=False):
combined_input = tf.concat(
[encoded_state, self.current_action], axis=1
)
self.forward_loss = tf.reduce_mean(squared_difference)
# tf.dynamic_partition(squared_difference, self.mask, 2)[1]
# )
def create_reward_model(
self,
encoded_state: tf.Tensor,
encoded_next_state: tf.Tensor,
forward_layers: int,
separate_train: bool = False,
):
combined_input = tf.concat([encoded_state, self.current_action], axis=1)
hidden = combined_input
if separate_train:
hidden = tf.stop_gradient(hidden)

self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),

# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.reward_loss = tf.clip_by_value(tf.reduce_mean(
tf.squared_difference(self.pred_reward, self.current_reward)
), 1e-10,1.0)
self.reward_loss = tf.clip_by_value(
tf.reduce_mean(
tf.squared_difference(self.pred_reward, self.current_reward)
),
1e-10,
1.0,
)
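
The reward head's loss above is a mean squared error clipped into [1e-10, 1.0]; the rationale for the clip is not stated in the diff. As a sketch:

import tensorflow.compat.v1 as tf  # assumption: TF1 graph-mode API

def reward_model_loss(pred_reward, current_reward):
    # Mean squared error between predicted and observed reward, clipped to the
    # same bounds used in the diff.
    mse = tf.reduce_mean(tf.squared_difference(pred_reward, current_reward))
    return tf.clip_by_value(mse, 1e-10, 1.0)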
self,
self,
h_size: int,
feature_size: int,
encoder_layers: int,

predict_return: bool
predict_return: bool,
) -> None:
with tf.variable_scope("encoding"):
self.visual_bisim = ModelUtils.create_visual_input_placeholders(

if self.normalize:
bi_normalization_tensors = self.create_target_normalizer(self.vector_bisim)
bi_normalization_tensors = self.create_target_normalizer(
self.vector_bisim
)
self.bi_update_normalization_op = bi_normalization_tensors.update_op
self.bi_normalization_steps = bi_normalization_tensors.steps
self.bi_running_mean = bi_normalization_tensors.running_mean

h_size,
encoder_layers,
vis_encode_type,
reuse=True
reuse=True,
hidden_stream,
feature_size,
name="latent",
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
reuse=True
)
hidden_stream,
feature_size,
name="latent",
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
reuse=True,
)
combined_input = tf.concat(
[self.bisim_encoder, self.bisim_action], axis=1
)
combined_input = tf.concat([self.bisim_encoder, self.bisim_action], axis=1)
combined_input = tf.stop_gradient(combined_input)
with tf.variable_scope("predict"):

if var_predict:
self.bisim_predict_distribution = GaussianEncoderDistribution(
hidden,
self.feature_size,
reuse=True
hidden, self.feature_size, reuse=True
)
self.bisim_predict = self.predict_distribution.sample()
else:

for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,
self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
reuse=True
# activation=ModelUtils.swish,

reuse=True
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
)

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (258 changes)


from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import TrainerSettings, PPOSettings, PPOTransferSettings
class PPOTransferOptimizer(TFOptimizer):
def __init__(self, policy: TransferPolicy, trainer_params: TrainerSettings):

# Transfer
self.use_transfer = hyperparameters.use_transfer
self.transfer_path = hyperparameters.transfer_path #"results/BallSingle_nosep_cmodel_small/3DBall"
self.transfer_path = (
hyperparameters.transfer_path
) # "results/BallSingle_nosep_cmodel_small/3DBall"
self.smart_transfer = hyperparameters.smart_transfer
self.conv_thres = hyperparameters.conv_thres
self.transfer_type = hyperparameters.transfer_type

self.bisim_update_dict: Dict[str, tf.Tensor] = {}
# Create the graph here to give more granular control of the TF graph to the Optimizer.
policy.create_tf_graph(hyperparameters.encoder_layers, hyperparameters.policy_layers,
hyperparameters.forward_layers, hyperparameters.inverse_layers, hyperparameters.feature_size,
self.use_transfer, self.separate_policy_train, self.use_var_encoder, self.use_var_predict,
self.predict_return, self.use_inverse_model, self.reuse_encoder, self.use_bisim)
policy.create_tf_graph(
hyperparameters.encoder_layers,
hyperparameters.policy_layers,
hyperparameters.forward_layers,
hyperparameters.inverse_layers,
hyperparameters.feature_size,
self.use_transfer,
self.separate_policy_train,
self.use_var_encoder,
self.use_var_predict,
self.predict_return,
self.use_inverse_model,
self.reuse_encoder,
self.use_bisim,
)
with policy.graph.as_default():
super().__init__(policy, trainer_params)

num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.burn_in_ratio = 0.0
self.num_updates = 0
self.alter_every = 400
self.copy_every = 1

"Policy/Beta": "decay_beta",
}
if self.predict_return:
self.stats_name_to_update_name.update({
"Losses/Reward Loss": "reward_loss",
})
self.stats_name_to_update_name.update(
{"Losses/Reward Loss": "reward_loss"}
)
# if self.use_bisim:
# self.stats_name_to_update_name.update({
# "Losses/Bisim Loss": "bisim_loss",

with tf.variable_scope("value"):
if policy.use_continuous_act:
if hyperparameters.separate_value_net:
self._create_cc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
self._create_cc_critic_old(
h_size, hyperparameters.value_layers, vis_encode_type
)
self._create_cc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
self._create_cc_critic(
h_size, hyperparameters.value_layers, vis_encode_type
)
self._create_dc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
self._create_dc_critic_old(
h_size, hyperparameters.value_layers, vis_encode_type
)
self._create_dc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
self._create_dc_critic(
h_size, hyperparameters.value_layers, vis_encode_type
)
with tf.variable_scope("optimizer/"):
self.learning_rate = ModelUtils.create_schedule(
self._schedule,

)
self.bisim_learning_rate = ModelUtils.create_schedule(
ScheduleType.CONSTANT,
lr/10,
lr / 10,
self.policy.global_step,
int(max_step),
min_value=1e-10,

"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
"model_learning_rate": self.model_learning_rate
"model_learning_rate": self.model_learning_rate,
self.update_dict.update(
{
"reward_loss": self.policy.reward_loss,
}
)
self.update_dict.update({"reward_loss": self.policy.reward_loss})
if self.use_alter or self.smart_transfer or self.in_batch_alter or self.in_epoch_alter or self.op_buffer:
if (
self.use_alter
or self.smart_transfer
or self.in_batch_alter
or self.in_epoch_alter
or self.op_buffer
):
self.policy.load_graph_partial(self.transfer_path, self.transfer_type,
hyperparameters.load_model, hyperparameters.load_policy, hyperparameters.load_value)
self.policy.load_graph_partial(
self.transfer_path,
self.transfer_type,
hyperparameters.load_model,
hyperparameters.load_policy,
hyperparameters.load_value,
)
def _create_cc_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:

ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
reuse=False,
)
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value

ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
reuse=False,
)
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value

)
def _create_losses(
self, probs, old_probs, value_heads, entropy, targ_encoder, predict,
beta, epsilon, lr, max_step
self,
probs,
old_probs,
value_heads,
entropy,
targ_encoder,
predict,
beta,
epsilon,
lr,
max_step,
):
"""
Creates training-specific Tensorflow ops for PPO models.

# )
# target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
# if self.predict_return:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
# if self.with_prior:
# if self.use_var_encoder:
# self.model_loss += encoder_distribution.kl_standard()

if self.use_inverse_model:
self.model_loss += 0.5 * self.policy.inverse_loss
predict_diff = self.policy.predict_distribution.w_distance(self.policy.bisim_predict_distribution)
predict_diff = self.policy.predict_distribution.w_distance(
self.policy.bisim_predict_distribution
)
tf.squared_difference(self.policy.bisim_predict, self.policy.predict)
tf.squared_difference(
self.policy.bisim_predict, self.policy.predict
)
tf.squared_difference(self.policy.bisim_pred_reward, self.policy.pred_reward)
tf.squared_difference(
self.policy.bisim_pred_reward, self.policy.pred_reward
)
predict_diff = self.reward_signals["extrinsic"].gamma * predict_diff + tf.abs(reward_diff)
predict_diff = self.reward_signals[
"extrinsic"
].gamma * predict_diff + tf.abs(reward_diff)
encode_dist = tf.reduce_mean(
tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
)
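
These fragments assemble a bisimulation-style target: the distance between two sampled states is the reward difference plus the discounted difference of their predicted next states, and it is compared against the distance between their encodings. A NumPy sketch of those quantities (how `encode_dist` and `predict_diff` are finally combined into `bisim_loss` is not visible in this diff):

import numpy as np

def bisim_target(pred_next_a, pred_next_b, reward_a, reward_b, gamma=0.99):
    # |r_a - r_b| + gamma * d(P_a, P_b), with a mean squared difference standing
    # in for the distance between predicted next-state encodings.
    predict_diff = np.mean(np.square(pred_next_a - pred_next_b))
    return gamma * predict_diff + np.abs(reward_a - reward_b)

def encoder_distance(enc_a, enc_b):
    # Mean squared distance between the two encodings, as in encode_dist above.
    return np.mean(np.square(enc_a - enc_b))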

def _create_ppo_optimizer_ops(self):
train_vars = []
if self.train_encoder:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse")

self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)
self.update_batch = self.tf_optimizer.minimize(self.loss, var_list=train_vars)
bisim_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
bisim_train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
self.bisim_grads = self.tf_optimizer.compute_gradients(self.bisim_loss, var_list=bisim_train_vars)
self.bisim_update_batch = self.tf_optimizer.minimize(self.bisim_loss, var_list=bisim_train_vars)
self.bisim_grads = self.tf_optimizer.compute_gradients(
self.bisim_loss, var_list=bisim_train_vars
)
self.bisim_update_batch = self.tf_optimizer.minimize(
self.bisim_loss, var_list=bisim_train_vars
)
{
"bisim_loss": self.bisim_loss,
"update_batch": self.bisim_update_batch,
"bisim_learning_rate": self.bisim_learning_rate,
}
)
{
"bisim_loss": self.bisim_loss,
"update_batch": self.bisim_update_batch,
"bisim_learning_rate": self.bisim_learning_rate,
}
)
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")

train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
policy_train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "policy"
) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)
self.ppo_grads = self.ppo_optimizer.compute_gradients(
self.ppo_loss, var_list=train_vars
)
self.ppo_update_batch = self.ppo_optimizer.minimize(
self.ppo_loss, var_list=train_vars
)
model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
model_train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "predict"
) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.model_grads = self.model_optimizer.compute_gradients(
self.model_loss, var_list=train_vars
)
self.model_update_batch = self.model_optimizer.minimize(
self.model_loss, var_list=train_vars
)
self.model_only_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=model_train_vars)
self.model_only_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=model_train_vars)
self.model_only_grads = self.model_optimizer.compute_gradients(
self.model_loss, var_list=model_train_vars
)
self.model_only_update_batch = self.model_optimizer.minimize(
self.model_loss, var_list=model_train_vars
)
self.ppo_update_dict.update(
{

"decay_beta": self.decay_beta,
}
)
self.model_update_dict.update(
{
"model_loss": self.model_loss,

)
if self.predict_return:
self.ppo_update_dict.update({
"reward_loss": self.policy.reward_loss,
})
self.ppo_update_dict.update({"reward_loss": self.policy.reward_loss})
self.model_update_dict.update({
"reward_loss": self.policy.reward_loss,
})
self.model_update_dict.update({"reward_loss": self.policy.reward_loss})
self.model_only_update_dict.update({
"reward_loss": self.policy.reward_loss,
})
self.model_only_update_dict.update({"reward_loss": self.policy.reward_loss})
@timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:

reward_signal.prepare_update(self.policy, batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
if self.use_alter:
# if self.num_updates / self.alter_every == 0:
# update_vals = self._execute_model(feed_dict, self.update_dict)

update_vals = self._execute_model(feed_dict, self.model_update_dict)
if self.num_updates % self.alter_every == 0:
print("start update model", self.num_updates)
else: # (self.num_updates / self.alter_every) % 2 == 0:
else: # (self.num_updates / self.alter_every) % 2 == 0:
stats_needed = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",

update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
if self.num_updates % self.alter_every == 0:
print("start update policy", self.num_updates)
elif self.in_batch_alter:
update_vals = self._execute_model(feed_dict, self.model_update_dict)
update_vals.update(self._execute_model(feed_dict, self.ppo_update_dict))

for stat_name, update_name in stats_needed.items():
# if update_name in update_vals.keys():
update_stats[stat_name] = update_vals[update_name]
def update_part(self, batch: AgentBuffer, num_sequences: int, update_type: str="policy") -> Dict[str, float]:
def update_part(
self, batch: AgentBuffer, num_sequences: int, update_type: str = "policy"
) -> Dict[str, float]:
"""
Performs update on model.
:param mini_batch: Batch of experiences.

feed_dict = self._construct_feed_dict(batch, num_sequences)
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():

stats_needed.update(reward_signal.stats_name_to_update_name)
if update_type == "model":
update_vals = self._execute_model(feed_dict, self.model_update_dict)
elif update_type == "policy":

return update_stats
def update_encoder(self, mini_batch1: AgentBuffer, mini_batch2: AgentBuffer):
stats_needed = {
"Losses/Bisim Loss": "bisim_loss",
"Policy/Bisim Learning Rate": "bisim_learning_rate",

selected_action_1 = self.policy.sess.run(self.policy.selected_actions, feed_dict = {
self.policy.vector_in: mini_batch1["vector_obs"],
})
selected_action_1 = self.policy.sess.run(
self.policy.selected_actions,
feed_dict={self.policy.vector_in: mini_batch1["vector_obs"]},
)
selected_action_2 = self.policy.sess.run(self.policy.selected_actions, feed_dict = {
self.policy.vector_in: mini_batch2["vector_obs"],
})
selected_action_2 = self.policy.sess.run(
self.policy.selected_actions,
feed_dict={self.policy.vector_in: mini_batch2["vector_obs"]},
)
feed_dict = {
self.policy.vector_in: mini_batch1["vector_obs"],

}
return update_stats
def _construct_feed_dict(

if self.policy.vis_obs_size > 0:
for i, _ in enumerate(self.policy.visual_in):
feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
feed_dict[self.policy.visual_next[i]] = mini_batch["next_visual_obs%d" % i]
feed_dict[self.policy.visual_next[i]] = mini_batch[
"next_visual_obs%d" % i
]
if self.policy.use_recurrent:
feed_dict[self.policy.memory_in] = [
mini_batch["memory"][i]

)
# print(self.policy.sess.run(self.policy.encoder, feed_dict={self.policy.vector_in: mini_batch["vector_obs"]}))
return feed_dict
def _create_cc_critic_old(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType

axis=1,
keepdims=True,
)

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (176 changes)


agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"],
agent_buffer_trajectory["next_vector_in"], agent_buffer_trajectory["vector_obs"])
self.policy.update_normalization(
agent_buffer_trajectory["vector_obs"],
agent_buffer_trajectory["next_vector_in"],
agent_buffer_trajectory["vector_obs"],
)
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy_old(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.
"""
if self.train_model and self.use_op_buffer:
self._update_model()
# if self.update_buffer.num_experiences < self.hyperparameters.buffer_size:
# return True
buffer_length = self.update_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
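
The advantage normalization above is the usual PPO standardization step; equivalently in NumPy:

import numpy as np

def normalize_advantages(advantages):
    # Zero-mean, unit-variance advantages; the small epsilon avoids division by
    # zero when the batch has constant advantages.
    return (advantages - advantages.mean()) / (advantages.std() + 1e-10)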
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
if self.use_iealter:
# if self.train_model:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# else:
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer = self.update_buffer
# max_num_batch = buffer_length // batch_size
# for i in range(0, max_num_batch * batch_size, batch_size):
# update_stats = self.optimizer.update_part(
# buffer.make_mini_batch(i, i + batch_size), n_sequences, "model_only"
# )
# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# if self.use_bisim:
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer1 = copy.deepcopy(self.update_buffer)
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer2 = copy.deepcopy(self.update_buffer)
# max_num_batch = buffer_length // batch_size
# for i in range(0, max_num_batch * batch_size, batch_size):
# update_stats = self.optimizer.update_encoder(
# buffer1.make_mini_batch(i, i + batch_size),
# buffer2.make_mini_batch(i, i + batch_size),
# )
# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
else:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
self.num_update += 1
return True
def _update_model(self):
"""
Uses demonstration_buffer to update the policy.

for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_bisim:
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
self.off_policy_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
self.off_policy_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:
if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:
# if abs(self.old_loss - np.mean(stat_list)) < 1e-3:
# self.train_model = False
# else:

update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
if self.off_policy_buffer.num_experiences > 4 * self.hyperparameters.buffer_size:
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
self.off_policy_buffer.truncate(
int(2 * self.hyperparameters.buffer_size)
)
print("truncate")
# self.train_model = False
# if self.off_policy_buffer.num_experiences > 4 * self.hyperparameters.buffer_size:
# self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
# self.off_policy_buffer.truncate(
# int(2 * self.hyperparameters.buffer_size)
# )
# print("truncate")
# # self.train_model = False
def _update_policy(self):
"""
Uses demonstration_buffer to update the policy.

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# for _ in range(num_epoch):
# for _ in range(num_epoch):
max_num_batch = update_buffer_length // batch_size # update with as much data as the policy has
max_num_batch = (
update_buffer_length // batch_size
) # update with as much data as the policy has
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"

if self.use_bisim:
# for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# for _ in range(num_epoch):
self.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
self.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
max_num_batch = update_buffer_length // batch_size # update with as much data as the policy has
max_num_batch = (
update_buffer_length // batch_size
) # update with as much data as the policy has
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)

self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
if self.off_policy_buffer.num_experiences > 10 * self.hyperparameters.buffer_size:
if (
self.off_policy_buffer.num_experiences
> 10 * self.hyperparameters.buffer_size
):
self.off_policy_buffer.truncate(
int(5 * self.hyperparameters.buffer_size)
)
self.off_policy_buffer.truncate(int(5 * self.hyperparameters.buffer_size))
return True
def create_policy(

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (153 changes)


from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary, TensorboardWriter, CSVWriter
from mlagents.trainers.stats import (
StatsReporter,
StatsWriter,
StatsSummary,
TensorboardWriter,
CSVWriter,
)
from mlagents.trainers.settings import (
TrainerSettings,
PPOSettings,

)
# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.

print(step, val, stats_summary.mean)
self.stats[step] = stats_summary.mean
self._last_reward_summary[category] = stats_summary.mean
def write2file(self, filename):
with open(filename, "w") as reward_file:
for step in self.stats.keys():

success_threshold=0.9,
env_manager=None,
run_id="id",
seed=1337
seed=1337,
):
# Create controller and begin training.
model_dir = "./transfer_results/" + run_id

csv_writer = CSVWriter(
model_dir,
required_fields=[
"Environment/Cumulative Reward",
"Environment/Episode Length",
],
required_fields=["Environment/Cumulative Reward", "Environment/Episode Length"],
tb_writer = TensorboardWriter(
model_dir, clear_past_data=True
)
tb_writer = TensorboardWriter(model_dir, clear_past_data=True)
StatsReporter.add_writer(tb_writer)
StatsReporter.add_writer(csv_writer)

# assert all(reward > success_threshold for reward in processed_rewards)
def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich1", run_id="model_rich1", seed=0):
def test_2d_model(
config=Transfer_CONFIG, obs_spec_type="rich1", run_id="model_rich1", seed=0
):
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"
[BRAIN_NAME],
use_discrete=False,
action_size=2,
step_size=0.1,
num_vector=2,
obs_spec_type=obs_spec_type,
goal_type="hard",
config.hyperparameters, batch_size=120, buffer_size=12000, learning_rate=5.0e-3,
use_bisim=True, predict_return=True,
# separate_value_train=True, separate_policy_train=True,
use_var_predict=True, with_prior=True, use_op_buffer=True, in_epoch_alter=False, in_batch_alter=True,
policy_layers=2, value_layers=2, encoder_layers=0, feature_size=2,
#use_inverse_model=True
config.hyperparameters,
batch_size=1200,
buffer_size=12000,
learning_rate=5.0e-3,
use_bisim=True,
predict_return=True,
reuse_encoder=True,
separate_value_train=True,
separate_policy_train=False,
use_var_predict=True,
with_prior=False,
use_op_buffer=False,
in_epoch_alter=False,
in_batch_alter=True,
policy_layers=0,
value_layers=2,
forward_layers=2,
encoder_layers=2,
feature_size=16,
# use_inverse_model=True
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)
config = attr.evolve(
config, hyperparameters=new_hyperparams, max_steps=500000, summary_freq=5000
)
_check_environment_trains(
env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed
)
def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich1",
def test_2d_transfer(
config=Transfer_CONFIG,
obs_spec_type="rich1",
run_id="transfer_f4_rich1_from-rich2-retrain-pv_rew_bisim-op", seed=1337):
run_id="transfer_f4_rich1_from-rich2-retrain-pv_rew_bisim-op",
seed=1337,
):
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"
[BRAIN_NAME],
use_discrete=False,
action_size=2,
step_size=0.1,
num_vector=2,
obs_spec_type=obs_spec_type,
goal_type="hard",
config.hyperparameters, batch_size=120, buffer_size=12000, use_transfer=True,
transfer_path=transfer_from, #separate_policy_train=True, separate_value_train=True,
use_op_buffer=False, in_epoch_alter=False, in_batch_alter=True, learning_rate=5.0e-3,
train_policy=True, train_value=True, train_model=False, feature_size=2,
use_var_predict=True, with_prior=True, policy_layers=2, load_policy=False,
load_value=False, predict_return=True, value_layers=2, encoder_layers=1,
use_bisim=False,
config.hyperparameters,
batch_size=1200,
buffer_size=12000,
use_transfer=True,
transfer_path=transfer_from, # separate_policy_train=True, separate_value_train=True,
use_op_buffer=False,
in_epoch_alter=False,
in_batch_alter=True,
learning_rate=5.0e-3,
train_policy=True,
train_value=True,
train_model=False,
feature_size=16,
use_var_predict=True,
with_prior=False,
policy_layers=0,
load_policy=False,
load_value=False,
predict_return=True,
forward_layers=2,
value_layers=2,
encoder_layers=2,
use_bisim=True,
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=300000, summary_freq=5000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)
config = attr.evolve(
config, hyperparameters=new_hyperparams, max_steps=500000, summary_freq=5000
)
_check_environment_trains(
env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed
)
# for obs in ["normal"]: # ["normal", "rich1", "rich2"]:
# test_2d_model(seed=0, obs_spec_type=obs, run_id="model_" + obs \
# + "_f2_pv-l2_linear-rew_ibalter_conlr_enc-l0-op4_bisim_suf1")
for obs in ["normal"]: # ["normal", "rich1", "rich2"]:
test_2d_model(seed=0, obs_spec_type=obs, run_id="model_" + obs)
test_2d_transfer(seed=0, obs_spec_type="rich1",
transfer_from="./transfer_results/model_"+ obs +"_f2_pv-l2_linear-rew_ibalter_conlr_enc-l0-op4_s0/Simple",
run_id="transfer_rich1_f2_pv-l2_ibalter_suf1_nobisim_from_" + obs)
# for obs in ["normal"]:
# test_2d_transfer(seed=0, obs_spec_type="rich1",
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim-nop_newalter_noreuse-soft0.1_s0/Simple",
# run_id="transfer_rich1_retrain-all_f4_pv-l0_rew_bisim-nop_noreuse-soft0.1_from_" + obs)
# for i in range(5):
# test_2d_model(seed=i)
test_2d_transfer(
seed=0,
obs_spec_type="rich1",
transfer_from="./transfer_results/model_" + obs + "_s0/Simple",
run_id="transfer_rich1",
)
# for obs in ["normal"]:
# test_2d_transfer(seed=0, obs_spec_type="rich1",
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim-nop_newalter_noreuse-soft0.1_s0/Simple",
# run_id="transfer_rich1_retrain-all_f4_pv-l0_rew_bisim-nop_noreuse-soft0.1_from_" + obs)
# for i in range(5):
# test_2d_model(seed=i)

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (54 changes)


vis_obs_size=VIS_OBS_SIZE,
vec_obs_size=OBS_SIZE,
action_size=1,
obs_spec_type="normal", # normal: (x,y); rich: (x+y, x-y, x*y)
goal_type="hard", # easy: 1 or -1; hard: uniformly random
obs_spec_type="normal", # normal: (x,y); rich: (x+y, x-y, x*y)
goal_type="hard", # easy: 1 or -1; hard: uniformly random
):
super().__init__()
self.discrete = use_discrete

elif self.goal_type == "hard":
self.goal[name] = []
for _ in range(self.num_vector):
self.goal[name].append(self.random.uniform(-1,1))
self.goal[name].append(self.random.uniform(-1, 1))
self.rewards[name] = 0
self.final_rewards[name] = []
self._reset_agent(name)

obs_spec.append((self.vec_obs_size,))
# composed position
if "rich" in self.obs_spec_type:
for _ in range(self.num_vector+1):
for _ in range(self.num_vector + 1):
obs_spec.append((self.vec_obs_size,))
print("obs_spec:", obs_spec)
return obs_spec

for name in self.names:
i = self.positions[name][0]
j = self.positions[name][1]
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i+j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i-j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i*j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i + j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i - j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i * j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i*j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (2*i+j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (2*i-j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i * j))
obs.append(
np.ones((1, self.vec_obs_size), dtype=np.float32) * (2 * i + j)
)
obs.append(
np.ones((1, self.vec_obs_size), dtype=np.float32) * (2 * i - j)
)
for _ in range(self.num_visual):
obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
return obs

# Both must be in 1.0 to be done
# print(self.positions[name], end="")
if self.goal_type == "easy":
done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name]) or self.step_count[name] >= self.horizon[name]
done = (
all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name])
or self.step_count[name] >= self.horizon[name]
)
done = self.step_count[name] >= self.horizon[name]
# done = all(abs(pos-goal) <= 0.1 for pos, goal in zip(self.positions[name], self.goal[name])) \
# or self.step_count[name] >= self.horizon[name]
# done = self.step_count[name] >= self.horizon[name]
done = (
all(
abs(pos - goal) <= 0.1
for pos, goal in zip(self.positions[name], self.goal[name])
)
or self.step_count[name] >= self.horizon[name]
)
# if done:
# print(self.positions[name], end=" done ")
return done
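
For the "hard" goal type, the termination check above reduces to the following sketch (names taken from the surrounding fragment):

def is_done(positions, goals, step_count, horizon):
    # Episode ends when every coordinate is within 0.1 of its goal, or when the
    # step horizon is exhausted.
    reached = all(abs(pos - goal) <= 0.1 for pos, goal in zip(positions, goals))
    return reached or step_count >= horizon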

return action_mask
def _compute_reward(self, name: str, done: bool) -> float:
reward = 0.0
for _pos, goal in zip(self.positions[name], self.goal[name]):
reward = -TIME_PENALTY
# for _pos, goal in zip(self.positions[name], self.goal[name]):
reward += 2 - abs(_pos - goal) #np.exp(-abs(_pos - goal))
# reward += 2 - abs(_pos - goal) #np.exp(-abs(_pos - goal))
# reward = SUCCESS_REWARD
# reward = 0#SUCCESS_REWARD
# # for _pos in self.positions[name]:
# # if self.goal_type == "easy":
# # reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(

elif self.goal_type == "hard":
self.goal[name] = []
for _ in range(self.num_vector):
self.goal[name].append(self.random.uniform(-1,1))
self.positions[name] = [self.random.uniform(-1,1) for _ in range(self.action_size)]
self.goal[name].append(self.random.uniform(-1, 1))
self.positions[name] = [
self.random.uniform(-1, 1) for _ in range(self.action_size)
]
self.step_count[name] = 0
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1
