
sac update

/develop/bisim-sac-transfer
yanchaosun, 4 years ago
Current commit
0a1a30d3
8 files changed: 126 insertions(+), 43 deletions(-)
  1. config/sac_transfer/3DBall.yaml (11 changes)
  2. config/sac_transfer/3DBallHard.yaml (4 changes)
  3. config/sac_transfer/3DBallHardTransfer.yaml (8 changes)
  4. ml-agents/mlagents/trainers/policy/transfer_policy.py (25 changes)
  5. ml-agents/mlagents/trainers/sac_transfer/optimizer.py (28 changes)
  6. ml-agents/mlagents/trainers/sac_transfer/trainer.py (9 changes)
  7. config/sac_transfer/CrawlerStatic.yaml (40 changes)
  8. config/sac_transfer/CrawlerStaticTransfer.yaml (44 changes)

config/sac_transfer/3DBall.yaml (11 changes)


trainer_type: sac_transfer
hyperparameters:
learning_rate: 0.0003
- learning_rate_schedule: constant
- model_schedule: constant
+ learning_rate_schedule: linear
+ model_schedule: linear
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0

forward_layers: 1
value_layers: 2
feature_size: 16
separate_value_train: true
- reuse_encoder: true
+ reuse_encoder: false
in_epoch_alter: false
in_batch_alter: true
use_op_buffer: false

- use_bisim: false
+ use_bisim: true
- normalize: false
+ normalize: true
hidden_units: 64
num_layers: 2
vis_encode_type: simple
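The schedule change above flips both learning_rate_schedule and model_schedule from constant to linear, i.e. the corresponding rates decay linearly toward zero as training approaches max_steps. A minimal sketch of that kind of decay, assuming a near-zero floor (the helper name and floor value are illustrative, not trainer internals):

# Minimal sketch of a linear decay schedule; the 1e-10 floor and the helper
# name are assumptions for illustration, not ml-agents internals.
def linear_schedule(initial_value: float, step: int, max_steps: int,
                    min_value: float = 1e-10) -> float:
    # Interpolate from initial_value at step 0 down to min_value at max_steps.
    frac = min(float(step) / float(max_steps), 1.0)
    return initial_value + frac * (min_value - initial_value)

# e.g. learning_rate: 0.0003 halfway through a hypothetical 500k-step run
print(linear_schedule(0.0003, step=250_000, max_steps=500_000))  # ~0.00015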

config/sac_transfer/3DBallHard.yaml (4 changes)


trainer_type: sac_transfer
hyperparameters:
learning_rate: 0.0003
- learning_rate_schedule: constant
+ learning_rate_schedule: linear
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0

use_var_predict: true
with_prior: false
predict_return: true
- use_bisim: false
+ use_bisim: true
network_settings:
normalize: true
hidden_units: 128

config/sac_transfer/3DBallHardTransfer.yaml (8 changes)


trainer_type: sac_transfer
hyperparameters:
learning_rate: 0.0003
- learning_rate_schedule: constant
+ learning_rate_schedule: linear
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0

use_var_predict: true
with_prior: false
predict_return: true
- use_bisim: false
+ use_bisim: true
+ load_policy: false
+ load_value: false
transfer_path: "results/"
transfer_path: "results/sac_model_ball_sep_bisim/3DBall"
network_settings:
normalize: true
hidden_units: 128
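transfer_path now points at the results folder of a completed 3DBall run from which the pretrained model is restored. As a loose illustration of the idea only (the helper and the scope names are assumptions, not the loading code in transfer_policy.py), restoring a subset of variables from such a folder in TF1 could look like:

# Illustrative sketch: restore selected variable scopes from a prior run's
# checkpoint directory. Helper name and scope names are assumptions.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def load_from_transfer_path(sess, transfer_path, scopes=("encoding", "predict")):
    var_list = [v for v in tf.global_variables()
                if any(v.name.startswith(s) for s in scopes)]
    saver = tf.train.Saver(var_list=var_list)
    ckpt = tf.train.latest_checkpoint(transfer_path)  # e.g. the path configured above
    saver.restore(sess, ckpt)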

ml-agents/mlagents/trainers/policy/transfer_policy.py (25 changes)


        forward_layers: int,
        var_predict: bool = False,
        reuse: bool = False,
        separate_train: bool = False,
    ) -> None:
        """
        Creates forward model TensorFlow ops for Curiosity module.

        """
        combined_input = tf.concat([encoded_state, encoded_action], axis=1)
        hidden = combined_input
        if not self.transfer:
            hidden = tf.stop_gradient(hidden)
        for i in range(forward_layers):
            hidden = tf.layers.dense(

        encoded_state: tf.Tensor,
        encoded_action: tf.Tensor,
        forward_layers: int,
        separate_train: bool = False,

-       # if self.transfer:
-       #     hidden = tf.stop_gradient(hidden)
+       if not self.transfer:
+           hidden = tf.stop_gradient(hidden)
        for i in range(forward_layers):
            hidden = tf.layers.dense(
                hidden,

        self.bisim_action = tf.placeholder(
            shape=[None, sum(self.act_size)], dtype=tf.float32, name="bisim_action"
        )
-       self.bisim_action_encoder = self._create_action_encoder(
-           self.bisim_action,
-           self.h_size,
-           self.action_feature_size,
-           action_layers,
-           reuse=True,
-       )
-       combined_input = tf.concat([self.bisim_encoder, self.bisim_action_encoder], axis=1)
+       # self.bisim_action_encoder = self._create_action_encoder(
+       #     self.bisim_action,
+       #     self.h_size,
+       #     self.action_feature_size,
+       #     action_layers,
+       #     reuse=True,
+       # )
+       combined_input = tf.concat([self.bisim_encoder, self.bisim_action], axis=1)
        combined_input = tf.stop_gradient(combined_input)
        with tf.variable_scope("predict"):
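The net effect of the hunks above: the forward/bisim prediction path stops gradients at its input unless the policy is in transfer mode, and the bisim branch now concatenates the encoded state with the raw action instead of an action encoding. A self-contained TF1-style sketch of that pattern (the function name, placeholder shapes, and hidden width are illustrative assumptions, not the actual transfer_policy.py code):

# Sketch of a forward model over (encoded state, raw action); names and sizes
# are illustrative, not the actual transfer_policy.py implementation.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def create_forward_model(encoded_state, encoded_action, forward_layers,
                         feature_size, transfer=False):
    hidden = tf.concat([encoded_state, encoded_action], axis=1)
    if not transfer:
        # Keep forward-model gradients from flowing back into the encoder.
        hidden = tf.stop_gradient(hidden)
    for i in range(forward_layers):
        hidden = tf.layers.dense(hidden, 128, activation=tf.nn.relu,
                                 name="forward_{}".format(i))
    with tf.variable_scope("predict"):
        return tf.layers.dense(hidden, feature_size, name="next_state")

state_ph = tf.placeholder(tf.float32, [None, 16], name="encoded_state")
action_ph = tf.placeholder(tf.float32, [None, 2], name="bisim_action")
next_state_pred = create_forward_model(state_ph, action_ph,
                                        forward_layers=1, feature_size=16)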

ml-agents/mlagents/trainers/sac_transfer/optimizer.py (28 changes)


        self.stats_name_to_update_name.update({
            "Losses/Reward Loss": "reward_loss",
        })
        if self.use_bisim:
            self.stats_name_to_update_name.update({
                "Losses/Bisim Loss": "bisim_loss",
                "Policy/Bisim Learning Rate": "bisim_learning_rate",
            })
        self.update_dict = {
            "value_loss": self.total_value_loss,

        policy_vars = self.policy.get_trainable_variables(
            train_encoder=self.train_encoder,
            train_action=self.train_action,
-           train_model=False,
+           train_model=self.train_model,
            train_policy=self.train_policy
        )

            train_model=self.train_model,
-           train_policy=False
+           train_policy=self.train_policy
        )
        if self.train_value:

        # Make sure policy is updated first, then value, then entropy.
        with tf.control_dependencies([self.update_batch_policy]):
            self.update_batch_value = value_optimizer.minimize(
-               self.total_value_loss, var_list=critic_vars
+               self.total_value_loss, var_list=self.policy_network.critic_vars
            )
        # Add entropy coefficient optimization operation
        with tf.control_dependencies([self.update_batch_value]):

                logger.debug(_var)

    @timed
-   def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
+   def update(self, batch: AgentBuffer, batch_bisim: AgentBuffer, num_sequences: int) -> Dict[str, float]:
        """
        Updates model using buffer.
        :param num_sequences: Number of trajectories in batch.

        feed_dict = self._construct_feed_dict(self.policy, batch, num_sequences)
        stats_needed = self.stats_name_to_update_name
        update_stats: Dict[str, float] = {}
-       update_vals = self._execute_model(feed_dict, self.update_dict)
-       update_vals.update(self._execute_model(feed_dict, self.model_update_dict))
-       if self.use_bisim:
-           batch1 = copy.deepcopy(batch)
-           batch.shuffle(sequence_length=1)
-           batch2 = copy.deepcopy(batch)
-           bisim_stats = self.update_encoder(batch1, batch2)
+       update_vals = self._execute_model(feed_dict, self.model_update_dict)
+       update_vals.update(self._execute_model(feed_dict, self.update_dict))
+       if self.use_bisim:
+           bisim_stats = self.update_encoder(batch, batch_bisim)
            update_stats.update(bisim_stats)
        # Update target network. By default, target update happens at every policy update.
-       self.sess.run(self.target_update_op)
+       self.policy.run_soft_copy()
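With this change, update_encoder receives two independently sampled minibatches instead of a shuffled deep copy of the same batch. A bisimulation-style encoder objective over such a pair typically regresses the distance between the two embeddings onto the reward difference plus a discounted distance between predicted next-state embeddings; a toy numpy illustration of that idea (not the loss implemented in this optimizer):

# Toy illustration of a bisimulation-metric encoder loss over a pair of
# minibatches; a generic sketch, not the sac_transfer optimizer's loss.
import numpy as np

def bisim_loss(z1, z2, r1, r2, z1_next_pred, z2_next_pred, gamma=0.99):
    # Embedding distance should track |reward gap| + gamma * predicted-transition gap.
    z_dist = np.abs(z1 - z2).sum(axis=1)
    r_dist = np.abs(r1 - r2)
    transition_dist = np.abs(z1_next_pred - z2_next_pred).sum(axis=1)
    target = r_dist + gamma * transition_dist
    return np.mean((z_dist - target) ** 2)

rng = np.random.default_rng(0)
z1, z2 = rng.normal(size=(4, 16)), rng.normal(size=(4, 16))
print(bisim_loss(z1, z2, rng.normal(size=4), rng.normal(size=4),
                 rng.normal(size=(4, 16)), rng.normal(size=(4, 16))))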

ml-agents/mlagents/trainers/sac_transfer/trainer.py (9 changes)


            self.hyperparameters.batch_size,
            sequence_length=self.policy.sequence_length,
        )
+       sampled_minibatch_bisim = buffer.sample_mini_batch(
+           self.hyperparameters.batch_size,
+           sequence_length=self.policy.sequence_length,
+       )
        # Get rewards for each reward
        for name, signal in self.optimizer.reward_signals.items():
            sampled_minibatch[

-       update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
+       update_stats = self.optimizer.update(
+           sampled_minibatch,
+           sampled_minibatch_bisim,
+           n_sequences)
        for stat_name, value in update_stats.items():
            batch_update_stats[stat_name].append(value)

config/sac_transfer/CrawlerStatic.yaml (40 changes)


behaviors:
  CrawlerStatic:
    trainer_type: sac_transfer
    hyperparameters:
      learning_rate: 0.0003
      learning_rate_schedule: constant
      batch_size: 256
      buffer_size: 500000
      buffer_init_steps: 2000
      tau: 0.005
      steps_per_update: 20.0
      save_replay_buffer: false
      init_entcoef: 1.0
      reward_signal_steps_per_update: 20.0
      encoder_layers: 2
      policy_layers: 1
      forward_layers: 2
      value_layers: 3
      feature_size: 128
      reuse_encoder: false
      in_epoch_alter: false
      in_batch_alter: true
      use_op_buffer: false
      use_var_predict: true
      with_prior: false
      predict_return: true
    network_settings:
      normalize: true
      hidden_units: 512
      num_layers: 3
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.995
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 3000000
    time_horizon: 1000
    summary_freq: 30000
    threaded: true

config/sac_transfer/CrawlerStaticTransfer.yaml (44 changes)


behaviors:
  CrawlerStatic:
    trainer_type: sac_transfer
    hyperparameters:
      learning_rate: 0.0003
      learning_rate_schedule: constant
      batch_size: 256
      buffer_size: 500000
      buffer_init_steps: 2000
      tau: 0.005
      steps_per_update: 20.0
      save_replay_buffer: false
      init_entcoef: 1.0
      reward_signal_steps_per_update: 20.0
      encoder_layers: 2
      policy_layers: 1
      forward_layers: 2
      value_layers: 3
      feature_size: 128
      reuse_encoder: false
      in_epoch_alter: false
      in_batch_alter: true
      use_op_buffer: false
      use_var_predict: true
      with_prior: false
      predict_return: true
      use_transfer: true
      load_model: true
      train_model: false
      transfer_path: "results/cs-sacmod-old/3DBall"
    network_settings:
      normalize: true
      hidden_units: 512
      num_layers: 3
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.995
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 3000000
    time_horizon: 1000
    summary_freq: 30000
    threaded: true
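With use_transfer and load_model enabled and train_model set to false, the intent is to reuse the loaded dynamics model as a frozen component while the policy and value heads are retrained on the new task. In TF1 terms that largely comes down to which variable scopes are handed to each optimizer; a hypothetical sketch of such a selection (the scope names are assumptions, not the ones used by get_trainable_variables):

# Hypothetical sketch of mapping config flags such as train_model / train_policy
# onto an optimizer var_list; scope names are illustrative assumptions.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def select_trainable_variables(train_model: bool, train_policy: bool, train_value: bool):
    def in_scope(prefix):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=prefix)

    var_list = []
    if train_model:  # stays frozen when the config sets train_model: false
        var_list += in_scope("predict")
    if train_policy:
        var_list += in_scope("policy")
    if train_value:
        var_list += in_scope("value")
    return var_list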