
separate policy net

/develop/bisim-sac-transfer
yanchaosun, 4 years ago
Commit 2e927257
7 files changed, with 84 additions and 214 deletions
  1. config/sac_transfer/3DBall.yaml (2 changes)
  2. config/sac_transfer/CrawlerStatic.yaml (8 changes)
  3. config/sac_transfer/CrawlerStaticTransfer.yaml (9 changes)
  4. ml-agents/mlagents/trainers/policy/transfer_policy.py (210 changes)
  5. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (32 changes)
  6. ml-agents/mlagents/trainers/sac_transfer/optimizer.py (36 changes)
  7. ml-agents/mlagents/trainers/settings.py (1 change)

config/sac_transfer/3DBall.yaml (2 changes)


 value_layers: 1
 action_layers: 1
 feature_size: 64
-action_feature_size: 16
+action_feature_size: 32
 # separate_value_net: true
 separate_policy_train: true
 # separate_value_train: true

config/sac_transfer/CrawlerStatic.yaml (8 changes)


 init_entcoef: 1.0
 reward_signal_steps_per_update: 20.0
 encoder_layers: 2
-policy_layers: 1
-forward_layers: 0
+policy_layers: 3
+forward_layers: 2
-feature_size: 32
+feature_size: 128
-# separate_policy_net: true
+separate_policy_net: true
 separate_model_train: true
 # separate_value_net: true
 reuse_encoder: true

config/sac_transfer/CrawlerStaticTransfer.yaml (9 changes)


 init_entcoef: 1.0
 reward_signal_steps_per_update: 20.0
 encoder_layers: 2
-policy_layers: 1
-forward_layers: 0
+policy_layers: 3
+forward_layers: 2
-feature_size: 32
+feature_size: 128
+separate_policy_net: true
 # separate_model_train: true
 # separate_value_net: true
 reuse_encoder: true

 train_model: false
 load_action: true
 train_action: false
-transfer_path: "results/oldcs-f32-a128/CrawlerStatic"
+transfer_path: "results/oldcs-sep-p/CrawlerStatic"
 network_settings:
   normalize: true
   hidden_units: 512

ml-agents/mlagents/trainers/policy/transfer_policy.py (210 changes)


action_feature_size=16,
transfer=False,
separate_train=False,
separate_policy_net=False,
separate_model_train=False,
var_encoder=False,
var_predict=True,

self._create_hard_copy()
self._create_soft_copy()
# self.encoder = self._create_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type,
# )
# self.targ_encoder = self._create_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type,
# reuse_encoder,
# )
self.action_encoder = self._create_action_encoder(
self.current_action,
self.h_size,

self.reparameterize,
self.condition_sigma_on_obs,
separate_train,
separate_policy_net
)
else:
self._create_dc_actor(

load_nets.append("encoding")
if load_action:
load_nets.append("action_enc")
# if self.inverse_model:
# load_nets.append("inverse")
if self.inverse_model:
load_nets.append("inverse")
with self.graph.as_default():
for net in load_nets:

partial_saver.restore(self.sess, partial_model_checkpoint)
print("loaded net", net, "from path", path)
# if load_encoder:
# self.run_hard_copy()
def _create_world_model(
    self,
    encoder: tf.Tensor,
    h_size: int,
    feature_size: int,
    num_layers: int,
    vis_encode_type: EncoderType,
    predict_return: bool = False,
) -> tf.Tensor:
    """
    Builds the world model for state prediction.
    """
    with self.graph.as_default():
        with tf.variable_scope("predict"):
            # self.current_action = tf.placeholder(
            #     shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
            # )
            hidden_stream = ModelUtils.create_vector_observation_encoder(
                tf.concat([encoder, self.current_action], axis=1),
                h_size,
                ModelUtils.swish,
                num_layers,
                scope=f"main_graph",
                reuse=False,
            )
            if predict_return:
                predict = tf.layers.dense(
                    hidden_stream, feature_size + 1, name="next_state"
                )
            else:
                predict = tf.layers.dense(
                    hidden_stream, feature_size, name="next_state"
                )
            return predict
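A small illustration of the shape produced above: with predict_return enabled, the "next_state" head emits feature_size + 1 units, which a consumer can split into the predicted next latent and a one-column return estimate. The helper below is hypothetical and not part of transfer_policy.py.

import numpy as np

def split_prediction(pred: np.ndarray, feature_size: int):
    # pred has shape [batch, feature_size + 1] when predict_return is True:
    # the first feature_size columns are the predicted next latent state,
    # the last column is the predicted return.
    next_latent = pred[:, :feature_size]
    predicted_return = pred[:, feature_size:]
    return next_latent, predicted_return

# Example with feature_size=32 (the value removed from the Crawler configs above):
batch = np.zeros((4, 33), dtype=np.float32)
latent, ret = split_prediction(batch, 32)
assert latent.shape == (4, 32) and ret.shape == (4, 1)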
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]

run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
# def _create_target_encoder(
# self,
# h_size: int,
# feature_size: int,
# num_layers: int,
# vis_encode_type: EncoderType,
# reuse_encoder: bool,
# ) -> tf.Tensor:
# if reuse_encoder:
# next_encoder_scope = "encoding"
# else:
# next_encoder_scope = "target_enc"
# self.visual_next = ModelUtils.create_visual_input_placeholders(
# self.brain.camera_resolutions
# )
# self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
# if self.normalize:
# vn_normalization_tensors = self.create_target_normalizer(self.vector_next)
# self.vn_update_normalization_op = vn_normalization_tensors.update_op
# self.vn_normalization_steps = vn_normalization_tensors.steps
# self.vn_running_mean = vn_normalization_tensors.running_mean
# self.vn_running_variance = vn_normalization_tensors.running_variance
# self.processed_vector_next = ModelUtils.normalize_vector_obs(
# self.vector_next,
# self.vn_running_mean,
# self.vn_running_variance,
# self.vn_normalization_steps,
# )
# else:
# self.processed_vector_next = self.vector_next
# self.vp_update_normalization_op = None
# with tf.variable_scope(next_encoder_scope):
# hidden_stream_targ = ModelUtils.create_observation_streams(
# self.visual_next,
# self.processed_vector_next,
# 1,
# h_size,
# num_layers,
# vis_encode_type,
# reuse=reuse_encoder,
# )[0]
# latent_targ = tf.layers.dense(
# hidden_stream_targ,
# feature_size,
# name="latent",
# reuse=reuse_encoder,
# activation=tf.tanh, # ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
# )
# return latent_targ
# return tf.stop_gradient(latent_targ)
# def _create_encoder(
# self,
# visual_in: List[tf.Tensor],
# vector_in: tf.Tensor,
# h_size: int,
# feature_size: int,
# num_layers: int,
# vis_encode_type: EncoderType,
# ) -> tf.Tensor:
# """
# Creates an encoder for visual and vector observations.
# :param h_size: Size of hidden linear layers.
# :param num_layers: Number of hidden linear layers.
# :param vis_encode_type: Type of visual encoder to use if visual input.
# :return: The hidden layer (tf.Tensor) after the encoder.
# """
# with tf.variable_scope("encoding"):
# hidden_stream = ModelUtils.create_observation_streams(
# visual_in, vector_in, 1, h_size, num_layers, vis_encode_type,
# )[0]
# latent = tf.layers.dense(
# hidden_stream,
# feature_size,
# name="latent",
# activation=tf.tanh, # ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
# )
# return latent
def _create_encoder_general(
self,
visual_in: List[tf.Tensor],

reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
separate_train: bool = False,
separate_net: bool = False
) -> None:
"""
Creates Continuous control actor-critic model.

:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
with tf.variable_scope("policy"):
if separate_net:
encoded = self._create_encoder_general(
self.visual_in,
self.processed_vector_in,
h_size,
self.feature_size,
num_layers,
self.vis_encode_type,
scope="policy_enc"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
if separate_train:
hidden_policy = tf.stop_gradient(hidden_policy)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
with tf.variable_scope("policy"):
hidden_policy = ModelUtils.create_vector_observation_encoder(
hidden_policy,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False,
)
# hidden_policy = ModelUtils.create_vector_observation_encoder(
# self.processed_vector_in,
# h_size,
# ModelUtils.swish,
# num_layers,
# scope=f"main_graph",
# reuse=False,
# )
if not separate_net:
if separate_train:
hidden_policy = tf.stop_gradient(hidden_policy)
hidden_policy = ModelUtils.create_vector_observation_encoder(
hidden_policy,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False,
)
distribution = GaussianDistribution(
hidden_policy,
self.act_size,

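A rough sketch of the branching the new separate_policy_net / separate_train flags appear to select in the actor code above: either the policy builds its own "policy_enc" encoder, or it reuses the shared latent, optionally behind a stop_gradient so policy updates do not train the shared encoder. This is an illustration in TF1-compat style, not the repository's code; the function name and layer choices are assumptions.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def policy_hidden(obs, shared_latent, h_size, separate_policy_net, separate_train):
    with tf.variable_scope("policy"):
        if separate_policy_net:
            # Policy-specific encoder, independent of the shared/model encoder.
            with tf.variable_scope("policy_enc"):
                hidden = tf.layers.dense(obs, h_size, activation=tf.nn.swish)
        else:
            hidden = shared_latent
            if separate_train:
                # Reuse the shared encoder but block policy gradients into it.
                hidden = tf.stop_gradient(hidden)
            hidden = tf.layers.dense(hidden, h_size, activation=tf.nn.swish)
    return hidden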
ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (32 changes)


 # Create the graph here to give more granular control of the TF graph to the Optimizer.
 policy.create_tf_graph(
-    hyperparameters.encoder_layers,
-    hyperparameters.action_layers,
-    hyperparameters.policy_layers,
-    hyperparameters.forward_layers,
-    hyperparameters.inverse_layers,
-    hyperparameters.feature_size,
-    hyperparameters.action_feature_size,
-    self.use_transfer,
-    self.separate_policy_train,
-    self.separate_model_train,
-    self.use_var_encoder,
-    self.use_var_predict,
-    self.predict_return,
-    self.use_inverse_model,
-    self.reuse_encoder,
-    self.use_bisim,
+    encoder_layers=hyperparameters.encoder_layers,
+    action_layers=hyperparameters.action_layers,
+    policy_layers=hyperparameters.policy_layers,
+    forward_layers=hyperparameters.forward_layers,
+    inverse_layers=hyperparameters.inverse_layers,
+    feature_size=hyperparameters.feature_size,
+    action_feature_size=hyperparameters.action_feature_size,
+    transfer=self.use_transfer,
+    separate_train=self.separate_policy_train,
+    separate_model_train=self.separate_model_train,
+    var_encoder=self.use_var_encoder,
+    var_predict=self.use_var_predict,
+    predict_return=self.predict_return,
+    inverse_model=self.use_inverse_model,
+    reuse_encoder=self.reuse_encoder,
+    use_bisim=self.use_bisim,
 )
with policy.graph.as_default():

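For readers skimming the change above: both optimizer files replace a long positional create_tf_graph call with keyword arguments. A minimal illustration (hypothetical function, not the ml-agents API) of why that matters when a flag-heavy signature grows:

def build(encoder_layers=1, policy_layers=2, separate_policy_net=False):
    return {
        "encoder_layers": encoder_layers,
        "policy_layers": policy_layers,
        "separate_policy_net": separate_policy_net,
    }

# Positional call: inserting a new parameter between the first two would
# silently shift every later argument into the wrong slot.
print(build(2, 3))
# Keyword call: order-independent and explicit about each flag, which is why
# this commit can add separate_policy_net and tau without breaking callers.
print(build(encoder_layers=2, policy_layers=3, separate_policy_net=True))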
ml-agents/mlagents/trainers/sac_transfer/optimizer.py (36 changes)


 self.separate_value_train = hyperparameters.separate_value_train
 self.separate_policy_train = hyperparameters.separate_policy_train
+self.separate_policy_net = hyperparameters.separate_policy_net
 self.separate_model_train = hyperparameters.separate_model_train
 self.use_var_encoder = hyperparameters.use_var_encoder
 self.use_var_predict = hyperparameters.use_var_predict

 # Create the graph here to give more granular control of the TF graph to the Optimizer.
 policy.create_tf_graph(
-    hyperparameters.encoder_layers,
-    hyperparameters.action_layers,
-    hyperparameters.policy_layers,
-    hyperparameters.forward_layers,
-    hyperparameters.inverse_layers,
-    hyperparameters.feature_size,
-    hyperparameters.action_feature_size,
-    self.use_transfer,
-    self.separate_policy_train,
-    self.separate_model_train,
-    self.use_var_encoder,
-    self.use_var_predict,
-    self.predict_return,
-    self.use_inverse_model,
-    self.reuse_encoder,
-    self.use_bisim,
-    hyperparameters.tau
+    encoder_layers=hyperparameters.encoder_layers,
+    action_layers=hyperparameters.action_layers,
+    policy_layers=hyperparameters.policy_layers,
+    forward_layers=hyperparameters.forward_layers,
+    inverse_layers=hyperparameters.inverse_layers,
+    feature_size=hyperparameters.feature_size,
+    action_feature_size=hyperparameters.action_feature_size,
+    transfer=self.use_transfer,
+    separate_train=self.separate_policy_train,
+    separate_policy_net=self.separate_policy_net,
+    separate_model_train=self.separate_model_train,
+    var_encoder=self.use_var_encoder,
+    var_predict=self.use_var_predict,
+    predict_return=self.predict_return,
+    inverse_model=self.use_inverse_model,
+    reuse_encoder=self.reuse_encoder,
+    use_bisim=self.use_bisim,
+    tau=hyperparameters.tau
 )
with policy.graph.as_default():

ml-agents/mlagents/trainers/settings.py (1 change)


 separate_policy_train: bool = False
 separate_model_train: bool = False
 separate_value_net: bool = False
+separate_policy_net: bool = False
 use_var_encoder: bool = False
 use_var_predict: bool = False
 with_prior: bool = False
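The single settings.py change above adds a separate_policy_net flag beside the existing booleans. A minimal sketch of how such hyperparameter fields are typically declared in ml-agents-style attrs settings classes; the class name below is hypothetical, for illustration only.

import attr

@attr.s(auto_attribs=True)
class TransferHyperparamSettings:  # hypothetical name, not the repository's class
    separate_policy_train: bool = False
    separate_model_train: bool = False
    separate_value_net: bool = False
    separate_policy_net: bool = False  # flag introduced by this commit
    use_var_encoder: bool = False
    use_var_predict: bool = False
    with_prior: bool = False

# Defaults keep existing configs valid; the CrawlerStatic*.yaml files above
# opt in with separate_policy_net: true.
settings = TransferHyperparamSettings()
assert settings.separate_policy_net is False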
