
Develop bisim action encoder, incorporate related hyperparameter settings (#4253)

/develop/bisim-review
GitHub 5 years ago
Current commit
9f041970
5 files changed, with 428 additions and 161 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (182 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (33 changes)
  3. ml-agents/mlagents/trainers/settings.py (13 changes)
  4. ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (229 changes)
  5. ml-agents/mlagents/trainers/tests/reward_plot.ipynb (132 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (182 changes)


def create_tf_graph(
self,
encoder_layers=1,
action_layers=1,
action_feature_size=16,
transfer=False,
separate_train=False,
var_encoder=False,

self.inverse_model = inverse_model
self.reuse_encoder = reuse_encoder
self.feature_size = feature_size
self.action_feature_size = action_feature_size
self.predict_return = predict_return
self.use_bisim = use_bisim
self.transfer = transfer

self.next_visual_in: List[tf.Tensor] = []
if var_encoder:
self.encoder_distribution, self.encoder = self._create_var_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
)
self.encoder = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
)
_, self.targ_encoder = self._create_var_target_encoder(
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
reuse_encoder,
)
else:
self.encoder = self._create_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
)
self.targ_encoder = self._create_target_encoder(
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type,
reuse_encoder,
)
self.action_encoder = self._create_action_encoder(
self.current_action,
self.h_size,
self.action_feature_size,
action_layers
)
- if self.inverse_model:
- with tf.variable_scope("inverse"):
- self.create_inverse_model(
- self.encoder, self.targ_encoder, inverse_layers
- )
+ # if self.inverse_model:
+ # with tf.variable_scope("inverse"):
+ # self.create_inverse_model(
+ # self.encoder, self.targ_encoder, inverse_layers
+ # )
self.action_encoder,
self.targ_encoder,
forward_layers,
var_predict=var_predict,

with tf.variable_scope("reward"):
self.create_reward_model(
- self.encoder, self.targ_encoder, forward_layers
+ self.encoder, self.action_encoder, forward_layers
)
if self.use_bisim:

def load_graph_partial(
self,
path: str,
transfer_type="dynamics",
- load_model=True,
- load_policy=True,
- load_value=True,
+ load_model=False,
+ load_policy=False,
+ load_value=False,
+ load_encoder=False,
+ load_action=False
load_nets = {"dynamics": [], "observation": ["encoding", "inverse"]}
load_nets = []
load_nets["dynamics"].append("predict")
load_nets.append("predict")
load_nets["dynamics"].append("reward")
load_nets.append("reward")
load_nets["dynamics"].append("policy")
load_nets.append("policy")
load_nets["dynamics"].append("value")
if self.inverse_model:
load_nets["dynamics"].append("inverse")
load_nets.append("value")
if load_encoder:
load_nets.append("encoding")
if load_action:
load_nets.append("action_enc")
# if self.inverse_model:
# load_nets.append("inverse")
for net in load_nets[transfer_type]:
for net in load_nets:
variables_to_restore = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, net
)

print("loaded net", net, "from path", path)
- if transfer_type == "observation":
- self.run_hard_copy()
+ # if load_encoder:
+ # self.run_hard_copy()
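The rewritten load_graph_partial collects a flat list of scope names and restores each one from the transfer checkpoint directory. A minimal sketch of that scope-wise restore, assuming one checkpoint file per scope (the path layout and the exact scope list are assumptions based on the per-scope save code further down):

# Sketch of scope-wise partial restore (illustrative paths and scopes).
import os
import tensorflow.compat.v1 as tf


def load_nets_partially(sess, path, load_encoder=False, load_action=False):
    load_nets = ["predict", "reward", "policy", "value"]
    if load_encoder:
        load_nets.append("encoding")
    if load_action:
        load_nets.append("action_enc")
    for net in load_nets:
        # Collect only the trainable variables under this scope and restore
        # them from a per-scope checkpoint such as "<path>/predict.ckpt".
        variables_to_restore = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, net
        )
        saver = tf.train.Saver(variables_to_restore)
        saver.restore(sess, os.path.join(path, net + ".ckpt"))
        print("loaded net", net, "from path", path)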
def _create_world_model(
self,

)
return latent
def _create_action_encoder(
self,
action: tf.Tensor,
h_size: int,
action_feature_size: int,
num_layers: int,
) -> tf.Tensor:
hidden_stream = ModelUtils.create_vector_observation_encoder(
action,
h_size,
ModelUtils.swish,
num_layers,
scope="action_enc",
reuse=False
)
with tf.variable_scope("action_enc"):
latent = tf.layers.dense(
hidden_stream,
action_feature_size,
name="latent",
activation=tf.tanh,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent
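_create_action_encoder runs the raw action through a swish MLP under the "action_enc" scope and then a tanh bottleneck of width action_feature_size. A small standalone shape check in the same style (the 2-dimensional action, batch size 5, and hidden width 128 are made-up values):

# Standalone shape check for an action-encoder-like block (illustrative sizes).
import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

action_in = tf.placeholder(tf.float32, [None, 2], name="current_action")
with tf.variable_scope("action_enc"):
    hidden = tf.layers.dense(action_in, 128, activation=tf.nn.swish, name="hidden_0")
    action_latent = tf.layers.dense(
        hidden,
        16,  # action_feature_size
        name="latent",
        activation=tf.tanh,
        kernel_initializer=tf.initializers.variance_scaling(1.0),
    )

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(action_latent, {action_in: np.zeros((5, 2), np.float32)})
    assert out.shape == (5, 16)  # one 16-d action feature per batch element

The tanh keeps the encoded action bounded in [-1, 1], analogous to the bounded latent this file produces for observations.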
def _create_hard_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")

def run_hard_copy(self):
self.sess.run(self.target_replace_op)
# def _create_inverse_model(
# self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
# ) -> None:
# """
# Creates inverse model TensorFlow ops for Curiosity module.
# Predicts action taken given current and future encoded states.
# :param encoded_state: Tensor corresponding to encoded current state.
# :param encoded_next_state: Tensor corresponding to encoded next state.
# """
# with tf.variable_scope("inverse"):
# combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
# hidden = tf.layers.dense(
# combined_input, self.h_size, activation=ModelUtils.swish
# )
# if self.brain.vector_action_space_type == "continuous":
# pred_action = tf.layers.dense(hidden, self.act_size[0], activation=None)
# squared_difference = tf.reduce_sum(
# tf.squared_difference(pred_action, self.current_action), axis=1
# )
# self.inverse_loss = tf.reduce_mean(
# tf.dynamic_partition(squared_difference, self.mask, 2)[1]
# )
# else:
# pred_action = tf.concat(
# [
# tf.layers.dense(
# hidden, self.act_size[i], activation=tf.nn.softmax
# )
# for i in range(len(self.act_size))
# ],
# axis=1,
# )
# cross_entropy = tf.reduce_sum(
# -tf.log(pred_action + 1e-10) * self.current_action, axis=1
# )
# self.inverse_loss = tf.reduce_mean(
# tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
# )
def _create_cc_actor(
self,
encoded: tf.Tensor,

encoding_checkpoint = os.path.join(self.model_path, f"encoding.ckpt")
encoding_saver.save(self.sess, encoding_checkpoint)
action_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "action_enc"
)
action_saver = tf.train.Saver(action_vars)
action_checkpoint = os.path.join(self.model_path, f"action_enc.ckpt")
action_saver.save(self.sess, action_checkpoint)
latent_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent"
)
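Saving mirrors loading: each scope gets its own tf.train.Saver and its own checkpoint file, so "encoding" or "action_enc" can later be restored independently. A condensed sketch of that pattern (the function name and default scope list are illustrative):

# Sketch: save each named scope to its own checkpoint (illustrative path).
import os
import tensorflow.compat.v1 as tf


def save_scopes_separately(sess, model_path, scopes=("encoding", "action_enc")):
    for scope in scopes:
        scope_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        saver = tf.train.Saver(scope_vars)
        # Produces e.g. "<model_path>/action_enc.ckpt", matching what the
        # partial-load path expects to find for that scope.
        saver.save(sess, os.path.join(model_path, scope + ".ckpt"))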

def create_forward_model(
self,
encoded_state: tf.Tensor,
encoded_action: tf.Tensor,
encoded_next_state: tf.Tensor,
forward_layers: int,
var_predict: bool = False,

:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
- combined_input = tf.concat([encoded_state, self.current_action], axis=1)
+ combined_input = tf.concat([encoded_state, encoded_action], axis=1)
hidden = combined_input
# if self.transfer:
# hidden = tf.stop_gradient(hidden)

def create_reward_model(
self,
encoded_state: tf.Tensor,
- encoded_next_state: tf.Tensor,
+ encoded_action: tf.Tensor,
- combined_input = tf.concat([encoded_state, self.current_action], axis=1)
+ combined_input = tf.concat([encoded_state, encoded_action], axis=1)
hidden = combined_input
# if self.transfer:
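Both create_forward_model and create_reward_model now concatenate the encoded state with the encoded action instead of self.current_action. One practical consequence, an inference rather than something stated in the diff, is that the model input width becomes feature_size + action_feature_size regardless of the environment's raw action dimension, which is convenient when the "predict" and "reward" nets are transferred between tasks. A tiny shape sketch:

# Shape sketch: the model input width no longer depends on the raw action size.
feature_size = 16          # state latent width
action_feature_size = 16   # action latent width

for raw_action_dim in (2, 4, 7):                      # e.g. different environments
    old_width = feature_size + raw_action_dim         # concat with raw action
    new_width = feature_size + action_feature_size    # concat with encoded action
    print(raw_action_dim, old_width, new_width)       # new_width is always 32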

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (33 changes)


self.in_epoch_alter = hyperparameters.in_epoch_alter
self.op_buffer = hyperparameters.use_op_buffer
self.train_encoder = hyperparameters.train_encoder
self.train_action = hyperparameters.train_action
) # "results/BallSingle_nosep_cmodel_small/3DBall"
)
self.transfer_type = hyperparameters.transfer_type
self.ppo_update_dict: Dict[str, tf.Tensor] = {}
self.model_update_dict: Dict[str, tf.Tensor] = {}

# Create the graph here to give more granular control of the TF graph to the Optimizer.
policy.create_tf_graph(
hyperparameters.encoder_layers,
hyperparameters.action_layers,
hyperparameters.action_feature_size,
self.use_transfer,
self.separate_policy_train,
self.use_var_encoder,

min_value=1e-10,
)
self.model_learning_rate = ModelUtils.create_schedule(
ScheduleType.LINEAR,
# ScheduleType.CONSTANT,
hyperparameters.model_schedule,
lr,
self.policy.global_step,
int(max_step),

ScheduleType.CONSTANT,
hyperparameters.model_schedule,
lr / 10,
self.policy.global_step,
int(max_step),

if self.use_transfer:
self.policy.load_graph_partial(
self.transfer_path,
self.transfer_type,
hyperparameters.load_encoder,
hyperparameters.load_action,
)
# self.policy.get_encoder_weights()
# self.policy.get_policy_weights()

def _create_ppo_optimizer_ops(self):
train_vars = []
if self.train_encoder:
train_vars += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
if self.train_action:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "action_enc")
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse")

train_vars += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
if self.train_action:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "action_enc")
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")

train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
policy_train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "policy"
) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(
self.ppo_loss, var_list=train_vars

)
model_train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "predict"
) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
self.model_optimizer = self.create_optimizer_op(self.model_learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(
self.model_loss, var_list=train_vars

)
model_train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "predict"
) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
self.model_only_optimizer = self.create_optimizer_op(self.model_learning_rate)
self.model_only_grads = self.model_optimizer.compute_gradients(
self.model_loss, var_list=model_train_vars
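The optimizer assembles its variable lists by scope: the PPO update can include or exclude "encoding" and "action_enc" via train_encoder / train_action, while the model-only update is restricted to "predict" and "reward". The sketch below shows that selection logic with a plain AdamOptimizer; the real code goes through create_optimizer_op and compute_gradients, so the function name, flag defaults, and learning rates here are illustrative:

# Sketch: build per-loss variable lists by scope (flags and rates illustrative).
import tensorflow.compat.v1 as tf


def build_update_ops(ppo_loss, model_loss, train_encoder=True, train_action=True,
                     train_model=True, lr=3e-4, model_lr=3e-4):
    def collect(scope):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)

    train_vars = collect("policy") + collect("value")
    if train_encoder:
        train_vars += collect("encoding")
    if train_action:
        train_vars += collect("action_enc")
    if train_model:
        train_vars += collect("predict") + collect("reward")

    # Separate optimizer for the model-only update, restricted to the
    # prediction heads so policy/value/encoder weights stay untouched.
    model_train_vars = collect("predict") + collect("reward")

    ppo_update = tf.train.AdamOptimizer(lr).minimize(ppo_loss, var_list=train_vars)
    model_update = tf.train.AdamOptimizer(model_lr).minimize(
        model_loss, var_list=model_train_vars
    )
    return ppo_update, model_update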

ml-agents/mlagents/trainers/settings.py (13 changes)


lambd: float = 0.95
num_epoch: int = 3
learning_rate_schedule: ScheduleType = ScheduleType.LINEAR
model_schedule: ScheduleType = ScheduleType.LINEAR
separate_value_train: bool = False
separate_policy_train: bool = False

in_epoch_alter: bool = False
use_op_buffer: bool = False
train_encoder: bool = True
train_action: bool = True
feature_size: int = 16
use_bisim: bool = False
# Transfer

transfer_path: str = ""
transfer_type: str = "dynamics"
- load_value: bool = True
- load_policy: bool = True
+ load_value: bool = False
+ load_policy: bool = False
+ load_encoder: bool = False
+ load_action: bool = False
+ action_layers: int = 1
feature_size: int = 16
+ action_feature_size: int = 16
@attr.s(auto_attribs=True)
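The new fields live in the PPO-transfer hyperparameter settings, so they can be set from the trainer configuration like any other hyperparameter. A stripped-down attrs sketch listing only the fields this commit touches (the class name and grouping comments are illustrative; the real settings class has many more fields):

# Illustrative attrs sketch of the new/changed hyperparameters only.
import attr


@attr.s(auto_attribs=True)
class TransferSettingsSketch:
    # Encoder / model sizes
    feature_size: int = 16
    action_layers: int = 1
    action_feature_size: int = 16
    # What the PPO update is allowed to train
    train_encoder: bool = True
    train_action: bool = True
    use_bisim: bool = False
    # Transfer: which pre-trained scopes to load (defaults now off)
    transfer_path: str = ""
    load_policy: bool = False
    load_value: bool = False
    load_encoder: bool = False
    load_action: bool = False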

ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (229 changes)
Diff content is too large to display. View the file instead.

ml-agents/mlagents/trainers/tests/reward_plot.ipynb (132 changes)
Diff content is too large to display. View the file instead.
