
fix crawler config

/develop/bisim-sac-transfer
yanchaosun, 4 years ago
Current commit
2b67d1a6
Showing 7 changed files with 106 additions and 82 deletions
  1. config/sac_transfer/3DBall.yaml (2 changes)
  2. config/sac_transfer/3DBallHard.yaml (2 changes)
  3. config/sac_transfer/3DBallHardTransfer.yaml (2 changes)
  4. config/sac_transfer/CrawlerStaticTransfer.yaml (2 changes)
  5. ml-agents/mlagents/trainers/policy/transfer_policy.py (5 changes)
  6. ml-agents/mlagents/trainers/sac_transfer/network.py (105 changes)
  7. ml-agents/mlagents/trainers/sac_transfer/optimizer.py (70 changes)

config/sac_transfer/3DBall.yaml (2 changes)


forward_layers: 1
value_layers: 1
feature_size: 16
separate_policy_train: true
separate_value_net: true
reuse_encoder: true
in_epoch_alter: false
in_batch_alter: true

config/sac_transfer/3DBallHard.yaml (2 changes)


forward_layers: 1
value_layers: 1
feature_size: 16
separate_policy_train: true
separate_value_net: true
reuse_encoder: false
in_epoch_alter: false
in_batch_alter: true

config/sac_transfer/3DBallHardTransfer.yaml (2 changes)


forward_layers: 1
value_layers: 1
feature_size: 16
separate_policy_train: true
separate_value_net: true
reuse_encoder: false
in_epoch_alter: false
in_batch_alter: false

config/sac_transfer/CrawlerStaticTransfer.yaml (2 changes)


use_transfer: true
load_model: true
train_model: false
transfer_path: "results/cs-sacmod-old/3DBall"
transfer_path: "results/cs-sacmod-old/CrawlerStatic"
network_settings:
normalize: true
hidden_units: 512

ml-agents/mlagents/trainers/policy/transfer_policy.py (5 changes)


variables_to_restore = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, net
)
if net == "value" and len(variables_to_restore) == 0:
variables_to_restore = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "critic"
)
net = "critic"
partial_saver = tf.train.Saver(variables_to_restore)
partial_model_checkpoint = os.path.join(path, f"{net}.ckpt")
partial_saver.restore(self.sess, partial_model_checkpoint)
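
The hunk above restores only the variables collected under a given scope, falling back from the "value" scope to "critic" when the current graph was built without a separate value head. A minimal, self-contained sketch of that fallback restore, assuming TF1-style imports and a hypothetical load_partial_model helper (with checkpoints named <scope>.ckpt inside the transfer path, as in the diff):

import os
import tensorflow.compat.v1 as tf  # the trainers use the TF1-style API


def load_partial_model(sess, path, nets):
    # Restore each sub-network's variables from <path>/<net>.ckpt.
    for net in nets:
        variables_to_restore = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, net
        )
        if net == "value" and len(variables_to_restore) == 0:
            # No "value" scope in this graph; the value head lives under "critic".
            variables_to_restore = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "critic"
            )
            net = "critic"
        partial_saver = tf.train.Saver(variables_to_restore)
        partial_model_checkpoint = os.path.join(path, f"{net}.ckpt")
        partial_saver.restore(sess, partial_model_checkpoint)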

ml-agents/mlagents/trainers/sac_transfer/network.py (105 changes)


return q1_heads, q2_heads, q1, q2
def _create_encoder(
self,
visual_in,
vector_in,
vis_encode_type,
encoder_layers,
scope,
reuse=False
):
"""
Creates the observation inputs, and a CNN if needed,
:param vis_encode_type: Type of CNN encoder.
:param share_ac_cnn: Whether or not to share the actor and critic CNNs.
:return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used
once and thrown away.
"""
hidden = self.policy._create_encoder_general(
visual_in,
vector_in,
self.h_size,
self.policy.feature_size,
encoder_layers,
vis_encode_type,
scope=scope,
reuse=reuse
)
return hidden
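
Both downstream networks in this file call the helper with their own scope and reuse flag; a sketch of the two call sites follows (the target network's observation arguments are collapsed out of the hunk further below, so they are assumed here):

# In SACTransferPolicyNetwork: reuse the behavior policy's "encoding" scope.
hidden_critic = self._create_encoder(
    self.visual_in,
    self.processed_vector_in,
    vis_encode_type,
    encoder_layers=encoder_layers,
    scope="encoding",
    reuse=True,
)

# In SACTransferTargetNetwork: encode the same way, but under the "target_enc" scope.
# The observation arguments are assumed; the hunk only shows the changed lines.
hidden_critic = self._create_encoder(
    self.visual_in,
    self.processed_vector_in,
    vis_encode_type,
    encoder_layers=encoder_layers,
    scope="target_enc",
    reuse=True,
)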
class SACTransferTargetNetwork(SACTransferNetwork):
"""

h_size=128,
normalize=False,
use_recurrent=False,
encoder_layers=0,
num_layers=2,
stream_names=None,
vis_encode_type=EncoderType.SIMPLE,

shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
)
self.value_memory_in = self.memory_in
hidden_critic = self._create_observation_in(
hidden_critic = self._create_encoder(
vis_encode_type
vis_encode_type,
encoder_layers=encoder_layers,
scope="target_enc",
reuse=True
if separate_train:
hidden_critic = tf.stop_gradient(hidden_critic)
# self._create_cc_critic(self.policy.targ_encoder, self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
# self._create_cc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
# self._create_dc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
# self.critic_vars += self.get_vars("target_enc")
# self.value_vars += self.get_vars("target_enc")
def copy_normalization(self, mean, variance, steps):
"""

update_norm_step = tf.assign(self.normalization_steps, steps)
return tf.group([update_mean, update_variance, update_norm_step])
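
The middle of copy_normalization is collapsed in this hunk; presumably it assigns the incoming mean and variance onto the target network's own normalizer variables before grouping the three update ops. A sketch of the full method, assuming the attribute names used by the normalizer elsewhere in ml-agents (running_mean, running_variance, normalization_steps):

def copy_normalization(self, mean, variance, steps):
    # Copy the policy's normalizer statistics into this target network so that
    # both networks normalize observations identically.
    update_mean = tf.assign(self.running_mean, mean)
    update_variance = tf.assign(self.running_variance, variance)
    update_norm_step = tf.assign(self.normalization_steps, steps)
    return tf.group([update_mean, update_variance, update_norm_step])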
def _create_observation_in(self, visual_in, vector_in, vis_encode_type):
"""
Creates the observation inputs, and a CNN if needed,
:param vis_encode_type: Type of CNN encoder.
:param share_ac_cnn: Whether or not to share the actor and critic CNNs.
:return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used
once and thrown away.
"""
hidden = self.policy._create_encoder_general(
visual_in,
vector_in,
self.h_size,
self.policy.feature_size,
1,
vis_encode_type,
scope="target_enc", #"target_network/critic/value",
reuse=True
)
return hidden
class SACTransferPolicyNetwork(SACTransferNetwork):
"""

h_size=128,
normalize=False,
use_recurrent=False,
encoder_layers=0,
separate_train=False
separate_train=False,
):
super().__init__(
policy,

if self.policy.use_recurrent:
self._create_memory_ins(m_size)
hidden_critic = self._create_observation_in(vis_encode_type)
# self.hidden = hidden_critic
hidden_critic = self._create_encoder(
self.visual_in,
self.processed_vector_in,
vis_encode_type,
encoder_layers=encoder_layers,
scope="encoding",
reuse=True
)
hidden = tf.stop_gradient(self.policy.encoder)
else:
hidden = self.policy.encoder
hidden_critic = tf.stop_gradient(hidden_critic)
# self._create_cc_critic(self.policy.encoder, self.policy.encoder, POLICY_SCOPE)
# self._create_cc_critic(hidden, POLICY_SCOPE)
else:
self._create_dc_critic(hidden_critic, POLICY_SCOPE)
# self._create_dc_critic(hidden, POLICY_SCOPE)

self.q1_memory_in = mem_ins[1]
self.q2_memory_in = mem_ins[2]
def _create_observation_in(self, vis_encode_type):
"""
Creates the observation inputs, and a CNN if needed,
:param vis_encode_type: Type of CNN encoder.
:param share_ac_cnn: Whether or not to share the actor and critic CNNs.
:return A tuple of (hidden_policy, hidden_critic). We don't save it to self since they're used
once and thrown away.
"""
hidden = self.policy._create_encoder_general(
self.policy.visual_in,
self.policy.processed_vector_in,
self.h_size,
self.policy.feature_size,
1,
vis_encode_type,
scope="encoding", #"critic/value",
reuse=True
)
return hidden

ml-agents/mlagents/trainers/sac_transfer/optimizer.py (70 changes)


from mlagents_envs.logging_util import get_logger
from mlagents.trainers.sac_transfer.network import SACTransferPolicyNetwork, SACTransferTargetNetwork
from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.policy.tf_policy import TFPolicy

self.update_batch_value: Optional[tf.Operation] = None
self.update_batch_entropy: Optional[tf.Operation] = None
self.policy_network = SACTransferPolicyNetwork(
policy=self.policy,
m_size=self.policy.m_size, # 3x policy.m_size
h_size=h_size,
normalize=self.policy.normalize,
use_recurrent=self.policy.use_recurrent,
num_layers=hyperparameters.value_layers,
stream_names=stream_names,
vis_encode_type=vis_encode_type,
separate_train=hyperparameters.separate_value_train
)
self.target_network = SACTransferTargetNetwork(
policy=self.policy,
m_size=self.policy.m_size, # 1x policy.m_size
h_size=h_size,
normalize=self.policy.normalize,
use_recurrent=self.policy.use_recurrent,
num_layers=hyperparameters.value_layers,
stream_names=stream_names,
vis_encode_type=vis_encode_type,
separate_train=hyperparameters.separate_value_train
)
if not hyperparameters.separate_value_net:
self.policy_network = SACTransferPolicyNetwork(
policy=self.policy,
m_size=self.policy.m_size, # 3x policy.m_size
h_size=h_size,
normalize=self.policy.normalize,
use_recurrent=self.policy.use_recurrent,
encoder_layers=hyperparameters.encoder_layers,
num_layers=hyperparameters.value_layers,
stream_names=stream_names,
vis_encode_type=vis_encode_type,
separate_train=hyperparameters.separate_value_train,
)
self.target_network = SACTransferTargetNetwork(
policy=self.policy,
m_size=self.policy.m_size, # 1x policy.m_size
h_size=h_size,
normalize=self.policy.normalize,
use_recurrent=self.policy.use_recurrent,
encoder_layers=hyperparameters.encoder_layers,
num_layers=hyperparameters.value_layers,
stream_names=stream_names,
vis_encode_type=vis_encode_type,
separate_train=hyperparameters.separate_value_train,
)
else:
self.policy_network = SACPolicyNetwork(
policy=self.policy,
m_size=self.policy.m_size, # 3x policy.m_size
h_size=h_size,
normalize=self.policy.normalize,
use_recurrent=self.policy.use_recurrent,
num_layers=num_layers,
stream_names=stream_names,
vis_encode_type=vis_encode_type,
)
self.target_network = SACTargetNetwork(
policy=self.policy,
m_size=self.policy.m_size, # 1x policy.m_size
h_size=h_size,
normalize=self.policy.normalize,
use_recurrent=self.policy.use_recurrent,
num_layers=num_layers,
stream_names=stream_names,
vis_encode_type=vis_encode_type,
)
# The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
self.m_size = 3 * self.policy.m_size
self._create_inputs_and_outputs()
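
Condensed, the new network-selection logic in the optimizer reads roughly as below. common_kwargs is a hypothetical shorthand for the constructor arguments repeated in all four calls, not a name from the source:

common_kwargs = dict(
    policy=self.policy,
    h_size=h_size,
    normalize=self.policy.normalize,
    use_recurrent=self.policy.use_recurrent,
    stream_names=stream_names,
    vis_encode_type=vis_encode_type,
)

if not hyperparameters.separate_value_net:
    # Transfer-aware value/target networks that take the shared encoder depth.
    self.policy_network = SACTransferPolicyNetwork(
        m_size=self.policy.m_size,  # 3x policy.m_size
        encoder_layers=hyperparameters.encoder_layers,
        num_layers=hyperparameters.value_layers,
        separate_train=hyperparameters.separate_value_train,
        **common_kwargs,
    )
    self.target_network = SACTransferTargetNetwork(
        m_size=self.policy.m_size,  # 1x policy.m_size
        encoder_layers=hyperparameters.encoder_layers,
        num_layers=hyperparameters.value_layers,
        separate_train=hyperparameters.separate_value_train,
        **common_kwargs,
    )
else:
    # Vanilla SAC value/target networks.
    self.policy_network = SACPolicyNetwork(
        m_size=self.policy.m_size, num_layers=num_layers, **common_kwargs
    )
    self.target_network = SACTargetNetwork(
        m_size=self.policy.m_size, num_layers=num_layers, **common_kwargs
    )

# The optimizer's m_size is 3 times the policy's (Q1, Q2, and Value).
self.m_size = 3 * self.policy.m_size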
