
sac transfer implementation; disable action encoder

/develop/bisim-sac-transfer
yanchaosun, 4 years ago
Current commit
0c468084
8 files changed, with 432 insertions and 106 deletions
1. ml-agents/mlagents/trainers/policy/transfer_policy.py (76 changes)
2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (12 changes)
3. ml-agents/mlagents/trainers/sac_transfer/network.py (106 changes)
4. ml-agents/mlagents/trainers/sac_transfer/optimizer.py (213 changes)
5. config/sac_transfer/3DBall.yaml (43 changes)
6. config/sac_transfer/3DBallHard.yaml (41 changes)
7. config/sac_transfer/3DBallHardTransfer.yaml (47 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (76 changes)


if create_tf_graph:
self.create_tf_graph()
def get_trainable_variables(self) -> List[tf.Variable]:
def get_trainable_variables(self,
train_encoder: bool=True,
train_action: bool=True,
train_model: bool=True,
train_policy: bool=True) -> List[tf.Variable]:
return self.trainable_variables
trainable_variables = []
if train_encoder:
trainable_variables += self.encoding_variables
if train_action:
trainable_variables += self.action_variables
if train_model:
trainable_variables += self.model_variables
if train_policy:
trainable_variables += self.policy_variables
return trainable_variables
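A hedged usage sketch of the new flag-based interface, mirroring how the SAC transfer optimizer calls it later in this commit; the policy object and flag values are illustrative:

# Build disjoint variable lists so the policy/value losses and the model/reward
# losses update different parts of the shared graph.
policy_vars = policy.get_trainable_variables(
    train_encoder=True, train_action=True, train_model=False, train_policy=True
)
model_vars = policy.get_trainable_variables(
    train_encoder=True, train_action=True, train_model=True, train_policy=False
)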
def create_tf_graph(
self,

# reuse_encoder,
# )
self.action_encoder = self._create_action_encoder(
self.current_action,
self.h_size,
self.action_feature_size,
action_layers
)
# self.action_encoder = self._create_action_encoder(
# self.current_action,
# self.h_size,
# self.action_feature_size,
# action_layers
# )
# if self.inverse_model:
# with tf.variable_scope("inverse"):

self.predict, self.predict_distribution = self.create_forward_model(
self.encoder,
self.action_encoder,
self.current_action,
forward_layers,
var_predict=var_predict,
)

self.action_encoder,
self.current_action,
forward_layers,
var_predict=var_predict,
reuse=True

if predict_return:
with tf.variable_scope("reward"):
self.create_reward_model(
self.encoder, self.action_encoder, forward_layers
self.encoder, self.current_action, forward_layers
)
if self.use_bisim:

self.encoder, self.h_size, policy_layers, separate_train
)
self.trainable_variables = tf.get_collection(
self.policy_variables = tf.get_collection(
self.trainable_variables += tf.get_collection(
self.encoding_variables = tf.get_collection(
self.trainable_variables += tf.get_collection(
self.action_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="action_enc"
)
self.model_variables = tf.get_collection(
) + tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="reward"
self.trainable_variables += tf.get_collection(
self.encoding_variables += tf.get_collection(
self.trainable_variables += tf.get_collection(
self.model_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="inverse"
)

encoding_checkpoint = os.path.join(self.model_path, f"encoding.ckpt")
encoding_saver.save(self.sess, encoding_checkpoint)
action_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "action_enc"
)
action_saver = tf.train.Saver(action_vars)
action_checkpoint = os.path.join(self.model_path, f"action_enc.ckpt")
action_saver.save(self.sess, action_checkpoint)
# action_vars = tf.get_collection(
# tf.GraphKeys.TRAINABLE_VARIABLES, "action_enc"
# )
# action_saver = tf.train.Saver(action_vars)
# action_checkpoint = os.path.join(self.model_path, f"action_enc.ckpt")
# action_saver.save(self.sess, action_checkpoint)
latent_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent"

predict_saver.save(self.sess, predict_checkpoint)
value_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
value_saver = tf.train.Saver(value_vars)
value_checkpoint = os.path.join(self.model_path, f"value.ckpt")
value_saver.save(self.sess, value_checkpoint)
if len(value_vars) > 0:
value_saver = tf.train.Saver(value_vars)
value_checkpoint = os.path.join(self.model_path, f"value.ckpt")
value_saver.save(self.sess, value_checkpoint)
critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "critic")
if len(critic_vars) > 0:
critic_saver = tf.train.Saver(critic_vars)
critic_checkpoint = os.path.join(self.model_path, f"critic.ckpt")
critic_saver.save(self.sess, critic_checkpoint)
if self.inverse_model:
inverse_vars = tf.get_collection(

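The per-scope savers above write separate checkpoints (encoding.ckpt, action_enc.ckpt, value.ckpt, critic.ckpt, and so on), which is what allows a transfer run to reload only part of the graph. A hedged sketch of restoring one of them, assuming an existing session sess and a transfer_path pointing at the saved run (neither is part of this hunk):

import os
from mlagents.tf_utils import tf

# Collect the same scope that was saved, build a scope-restricted Saver, and restore it.
encoding_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
encoding_saver = tf.train.Saver(encoding_vars)
encoding_saver.restore(sess, os.path.join(transfer_path, "encoding.ckpt"))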
ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (12 changes)


self.stats_name_to_update_name.update(
{"Losses/Reward Loss": "reward_loss"}
)
# if self.use_bisim:
# self.stats_name_to_update_name.update({
# "Losses/Bisim Loss": "bisim_loss",
# })
if self.use_bisim:
self.stats_name_to_update_name.update({
"Losses/Bisim Loss": "bisim_loss",
})
if self.policy.use_recurrent:
self.m_size = self.policy.m_size
self.memory_in = tf.placeholder(

tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
self.bisim_optimizer = self.create_optimizer_op(self.bisim_learning_rate)
self.bisim_grads = self.tf_optimizer.compute_gradients(
self.bisim_grads = self.bisim_optimizer.compute_gradients(
self.bisim_update_batch = self.tf_optimizer.minimize(
self.bisim_update_batch = self.bisim_optimizer.minimize(
self.bisim_loss, var_list=bisim_train_vars
)
self.bisim_update_dict.update(

ml-agents/mlagents/trainers/sac_transfer/network.py (106 changes)


self.value_heads[name] = value
self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
def _create_cc_critic(self, hidden_value, scope, create_qs=True):
def _create_cc_critic(self, encoder, hidden_value, scope, create_qs=True):
"""
Creates just the critic network
"""

hidden_value,
encoder,
self.num_layers,
self.h_size,
self.join_scopes(scope, "value"),

num_layers=2,
stream_names=None,
vis_encode_type=EncoderType.SIMPLE,
separate_train=False
):
super().__init__(
policy,

vis_encode_type,
)
with tf.variable_scope(TARGET_SCOPE):
# self.visual_in = ModelUtils.create_visual_input_placeholders(
# policy.brain.camera_resolutions
# )
# self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
# if self.policy.normalize:
# normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
# self.update_normalization_op = normalization_tensors.update_op
# self.normalization_steps = normalization_tensors.steps
# self.running_mean = normalization_tensors.running_mean
# self.running_variance = normalization_tensors.running_variance
# self.processed_vector_in = ModelUtils.normalize_vector_obs(
# self.vector_in,
# self.running_mean,
# self.running_variance,
# self.normalization_steps,
# )
# else:
# self.processed_vector_in = self.vector_in
# self.update_normalization_op = None
self.visual_in = ModelUtils.create_visual_input_placeholders(
policy.brain.camera_resolutions
)
self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
if self.policy.normalize:
normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
self.update_normalization_op = normalization_tensors.update_op
self.normalization_steps = normalization_tensors.steps
self.running_mean = normalization_tensors.running_mean
self.running_variance = normalization_tensors.running_variance
self.processed_vector_in = ModelUtils.normalize_vector_obs(
self.vector_in,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_in = self.vector_in
self.update_normalization_op = None
if self.policy.use_recurrent:
self.memory_in = tf.placeholder(

# hidden_streams = ModelUtils.create_observation_streams(
# self.visual_in,
# self.processed_vector_in,
# 1,
# self.h_size,
# 0,
# vis_encode_type=vis_encode_type,
# stream_scopes=["critic/value/"],
# )
hidden_streams = ModelUtils.create_observation_streams(
self.visual_in,
self.processed_vector_in,
1,
self.h_size,
0,
vis_encode_type=vis_encode_type,
stream_scopes=["critic/value/"],
)
self._create_cc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
self._create_cc_critic(hidden_streams[0], hidden_streams[0], TARGET_SCOPE, create_qs=False)
# self._create_cc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
self._create_dc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
# self._create_dc_critic(self.policy.targ_encoder, TARGET_SCOPE, create_qs=False)
# def copy_normalization(self, mean, variance, steps):
# """
# Copies the mean, variance, and steps into the normalizers of the
# input of this SACNetwork. Used to copy the normalizer from the policy network
# to the target network.
# param mean: Tensor containing the mean.
# param variance: Tensor containing the variance
# param steps: Tensor containing the number of steps.
# """
# update_mean = tf.assign(self.running_mean, mean)
# update_variance = tf.assign(self.running_variance, variance)
# update_norm_step = tf.assign(self.normalization_steps, steps)
# return tf.group([update_mean, update_variance, update_norm_step])
def copy_normalization(self, mean, variance, steps):
"""
Copies the mean, variance, and steps into the normalizers of the
input of this SACNetwork. Used to copy the normalizer from the policy network
to the target network.
param mean: Tensor containing the mean.
param variance: Tensor containing the variance
param steps: Tensor containing the number of steps.
"""
update_mean = tf.assign(self.running_mean, mean)
update_variance = tf.assign(self.running_variance, variance)
update_norm_step = tf.assign(self.normalization_steps, steps)
return tf.group([update_mean, update_variance, update_norm_step])
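This copy op is consumed by the SAC transfer optimizer later in this commit; a condensed sketch of that wiring (attribute names taken from the optimizer hunk below):

if policy.normalize:
    target_update_norm = target_network.copy_normalization(
        policy.running_mean, policy.running_variance, policy.normalization_steps
    )
    # Whenever the policy updates its normalizer, the target network follows.
    policy.update_normalization_op = tf.group(
        [policy.update_normalization_op, target_update_norm]
    )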
class SACTransferPolicyNetwork(SACTransferNetwork):

num_layers=2,
stream_names=None,
vis_encode_type=EncoderType.SIMPLE,
separate_train=False
):
super().__init__(
policy,

# Use the sequence length of the policy
self.sequence_length_ph = self.policy.sequence_length_ph
if separate_train:
hidden = tf.stop_gradient(self.policy.encoder)
else:
hidden = self.policy.encoder
self._create_cc_critic(self.policy.encoder, POLICY_SCOPE)
self._create_cc_critic(hidden_critic, hidden_critic, POLICY_SCOPE)
# self._create_cc_critic(hidden, POLICY_SCOPE)
self._create_dc_critic(self.policy.encoder, POLICY_SCOPE)
self._create_dc_critic(hidden_critic, POLICY_SCOPE)
# self._create_dc_critic(hidden, POLICY_SCOPE)
if self.use_recurrent:
mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]

ml-agents/mlagents/trainers/sac_transfer/optimizer.py (213 changes)


import numpy as np
from typing import Dict, List, Optional, Any, Mapping, cast
import copy
from mlagents.tf_utils import tf

class SACTransferOptimizer(TFOptimizer):
def __init__(self, policy: TransferPolicy, trainer_params: TrainerSettings):
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
"""
Takes a Unity environment and model-specific hyper-parameters and returns the
appropriate PPO agent model for the environment.

h_size=h_size,
normalize=self.policy.normalize,
use_recurrent=self.policy.use_recurrent,
num_layers=num_layers,
num_layers=hyperparameters.value_layers,
separate_train=hyperparameters.separate_value_train
)
self.target_network = SACTransferTargetNetwork(
policy=self.policy,

use_recurrent=self.policy.use_recurrent,
num_layers=num_layers,
num_layers=hyperparameters.value_layers,
separate_train=hyperparameters.separate_value_train
)
# The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
self.m_size = 3 * self.policy.m_size

int(max_step),
min_value=1e-10,
)
self.model_learning_rate = ModelUtils.create_schedule(
hyperparameters.model_schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self.bisim_learning_rate = ModelUtils.create_schedule(
hyperparameters.model_schedule,
lr / 10,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy_network.q1_heads,
self.policy_network.q2_heads,

self.selected_actions = (
self.policy.selected_actions
) # For GAIL and other reward signals
# if self.policy.normalize:
# target_update_norm = self.target_network.copy_normalization(
# self.policy.running_mean,
# self.policy.running_variance,
# self.policy.normalization_steps,
# )
# # Update the normalization of the optimizer when the policy does.
# self.policy.update_normalization_op = tf.group(
# [self.policy.update_normalization_op, target_update_norm]
# )
if self.policy.normalize:
target_update_norm = self.target_network.copy_normalization(
self.policy.running_mean,
self.policy.running_variance,
self.policy.normalization_steps,
)
# Update the normalization of the optimizer when the policy does.
self.policy.update_normalization_op = tf.group(
[self.policy.update_normalization_op, target_update_norm]
)
self.policy.initialize_or_load()

self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Model Loss": "model_loss",
"Policy/Model Learning Rate": "model_learning_rate",
if self.predict_return:
self.stats_name_to_update_name.update({
"Losses/Reward Loss": "reward_loss",
})
if self.use_bisim:
self.stats_name_to_update_name.update({
"Losses/Bisim Loss": "bisim_loss",
"Policy/Bisim Learning Rate": "bisim_learning_rate",
})
self.update_dict = {
"value_loss": self.total_value_loss,
"policy_loss": self.policy_loss,

"""
self.vector_in = self.policy.vector_in
self.visual_in = self.policy.visual_in
self.next_vector_in = self.policy.vector_next
self.next_visual_in = self.policy.visual_next
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
self.sequence_length_ph = self.policy.sequence_length_ph
self.next_sequence_length_ph = self.target_network.sequence_length_ph
if not self.policy.use_continuous_act:

self.entropy = self.policy_network.entropy
self.model_loss = self.policy.forward_loss
if self.predict_return:
self.model_loss += 0.5 * self.policy.reward_loss
if self.with_prior:
if self.use_var_encoder:
self.model_loss += 0.2 * self.policy.encoder_distribution.kl_standard()
if self.use_var_predict:
self.model_loss += 0.2 * self.policy.predict_distribution.kl_standard()
if self.use_bisim:
if self.use_var_predict:
predict_diff = self.policy.predict_distribution.w_distance(
self.policy.bisim_predict_distribution
)
else:
predict_diff = tf.reduce_mean(
tf.reduce_sum(
tf.squared_difference(
self.policy.bisim_predict, self.policy.predict
),
axis=1,
)
)
if self.predict_return:
reward_diff = tf.reduce_sum(
tf.abs(self.policy.bisim_pred_reward - self.policy.pred_reward),
axis=1,
)
predict_diff = (
self.reward_signals["extrinsic"].gamma * predict_diff + reward_diff
)
encode_dist = tf.reduce_sum(
tf.abs(self.policy.encoder - self.policy.bisim_encoder), axis=1
)
self.predict_difference = predict_diff
self.reward_difference = reward_diff
self.encode_difference = encode_dist
self.bisim_loss = tf.reduce_mean(
tf.squared_difference(encode_dist, predict_diff)
)
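A hedged NumPy restatement of the bisimulation objective assembled above, for the non-variational case with predict_return enabled; array names and shapes are illustrative:

import numpy as np

def bisim_loss(enc1, enc2, pred1, pred2, rew1, rew2, gamma):
    # Squared distance between predicted next-state features, averaged over the batch.
    predict_diff = np.mean(np.sum((pred1 - pred2) ** 2, axis=1))
    # Per-sample L1 distance between predicted rewards.
    reward_diff = np.sum(np.abs(rew1 - rew2), axis=1)
    # Bisimulation target: discounted transition distance plus reward distance.
    target = gamma * predict_diff + reward_diff
    # L1 distance between the two state encodings.
    encode_dist = np.sum(np.abs(enc1 - enc2), axis=1)
    # Push the encoding distance toward the bisimulation target.
    return np.mean((encode_dist - target) ** 2)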
def _create_sac_optimizer_ops(self) -> None:
"""
Creates the Adam optimizers and update ops for SAC, including

value_optimizer = self.create_optimizer_op(
learning_rate=self.learning_rate, name="sac_value_opt"
)
self.target_update_op = [
tf.assign(target, (1 - self.tau) * target + self.tau * source)

]
logger.debug("value_vars")
self.print_all_vars(self.policy_network.value_vars)
logger.debug("targvalue_vars")
self.print_all_vars(self.target_network.value_vars)
logger.debug("critic_vars")
self.print_all_vars(self.policy_network.critic_vars)
logger.debug("q_vars")
self.print_all_vars(self.policy_network.q_vars)
logger.debug("policy_vars")
policy_vars = self.policy.get_trainable_variables()
self.print_all_vars(policy_vars)
policy_vars = self.policy.get_trainable_variables(
train_encoder=self.train_encoder,
train_action=self.train_action,
train_model=False,
train_policy=self.train_policy
)
model_vars = self.policy.get_trainable_variables(
train_encoder=self.train_encoder,
train_action=self.train_action,
train_model=self.train_model,
train_policy=False
)
if self.train_value:
critic_vars = self.policy_network.critic_vars + policy_vars
else:
critic_vars = policy_vars
self.target_init_op = [
tf.assign(target, source)

# Make sure policy is updated first, then value, then entropy.
with tf.control_dependencies([self.update_batch_policy]):
self.update_batch_value = value_optimizer.minimize(
self.total_value_loss, var_list=self.policy_network.critic_vars
self.total_value_loss, var_list=critic_vars
)
# Add entropy coefficient optimization operation
with tf.control_dependencies([self.update_batch_value]):

model_optimizer = self.create_optimizer_op(
learning_rate=self.model_learning_rate, name="sac_model_opt"
)
self.update_batch_model = model_optimizer.minimize(
self.model_loss, var_list=model_vars
)
self.model_update_dict.update(
{
"model_loss": self.model_loss,
"update_batch": self.update_batch_model,
"model_learning_rate": self.model_learning_rate,
}
)
if self.predict_return:
self.model_update_dict.update({"reward_loss": self.policy.reward_loss})
if self.use_bisim:
bisim_train_vars = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
)
self.bisim_optimizer = self.create_optimizer_op(self.bisim_learning_rate)
self.bisim_update_batch = self.bisim_optimizer.minimize(
self.bisim_loss, var_list=bisim_train_vars
)
self.bisim_update_dict.update(
{
"bisim_loss": self.bisim_loss,
"update_batch": self.bisim_update_batch,
"bisim_learning_rate": self.bisim_learning_rate,
}
)
def print_all_vars(self, variables):
for _var in variables:
logger.debug(_var)

stats_needed = self.stats_name_to_update_name
update_stats: Dict[str, float] = {}
update_vals = self._execute_model(feed_dict, self.update_dict)
update_vals.update(self._execute_model(feed_dict, self.model_update_dict))
if self.use_bisim:
batch1 = copy.deepcopy(batch)
batch.shuffle(sequence_length=1)
batch2 = copy.deepcopy(batch)
bisim_stats = self.update_encoder(batch1, batch2)
self.policy.run_soft_copy()
def update_encoder(self, mini_batch1: AgentBuffer, mini_batch2: AgentBuffer):
stats_needed = {
"Losses/Bisim Loss": "bisim_loss",
"Policy/Bisim Learning Rate": "bisim_learning_rate",
}
update_stats = {}
selected_action_1 = self.policy.sess.run(
self.policy.selected_actions,
feed_dict={self.policy.vector_in: mini_batch1["vector_obs"]},
)
selected_action_2 = self.policy.sess.run(
self.policy.selected_actions,
feed_dict={self.policy.vector_in: mini_batch2["vector_obs"]},
)
feed_dict = {
self.policy.vector_in: mini_batch1["vector_obs"],
self.policy.vector_bisim: mini_batch2["vector_obs"],
self.policy.current_action: selected_action_1,
self.policy.bisim_action: selected_action_2,
}
update_vals = self._execute_model(feed_dict, self.bisim_update_dict)
for stat_name, update_name in stats_needed.items():
if update_name in update_vals.keys():
update_stats[stat_name] = update_vals[update_name]
return update_stats
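A hedged sketch of how update_encoder is driven from update() above: the two minibatches are the same sampled batch before and after an in-place shuffle, so the bisimulation loss compares random pairs of states; batch and optimizer are assumed to exist:

import copy

batch1 = copy.deepcopy(batch)      # original ordering
batch.shuffle(sequence_length=1)   # shuffle in place
batch2 = copy.deepcopy(batch)      # permuted copy, giving random state pairs
bisim_stats = optimizer.update_encoder(batch1, batch2)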
def update_reward_signals(
self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int
) -> Dict[str, float]:

policy.sequence_length_ph: self.policy.sequence_length,
self.next_sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: batch["masks"] * burn_in_mask,
self.policy.current_action: batch["actions"],
self.policy.current_reward: batch["extrinsic_rewards"],
}
for name in self.reward_signals:
feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)]

if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = batch["vector_obs"]
feed_dict[self.next_vector_in] = batch["next_vector_in"]
feed_dict[policy.vector_next] = batch["next_vector_in"]
if self.policy.vis_obs_size > 0:
for i, _ in enumerate(policy.visual_in):
_obs = batch["visual_obs%d" % i]

feed_dict[self.next_visual_in[i]] = _obs
feed_dict[policy.visual_next[i]] = _obs
if self.policy.use_recurrent:
feed_dict[policy.memory_in] = [
batch["memory"][i]

config/sac_transfer/3DBall.yaml (43 changes)


behaviors:
3DBall:
trainer_type: sac_transfer
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
model_schedule: constant
batch_size: 64
buffer_size: 12000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 0.5
reward_signal_steps_per_update: 10.0
encoder_layers: 1
policy_layers: 1
forward_layers: 1
value_layers: 2
feature_size: 16
separate_value_train: true
reuse_encoder: true
in_epoch_alter: false
in_batch_alter: true
use_op_buffer: false
use_var_predict: true
with_prior: false
predict_return: true
use_bisim: false
network_settings:
normalize: false
hidden_units: 64
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

config/sac_transfer/3DBallHard.yaml (41 changes)


behaviors:
3DBallHard:
trainer_type: sac_transfer
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
encoder_layers: 1
policy_layers: 1
forward_layers: 1
value_layers: 2
feature_size: 16
reuse_encoder: false
in_epoch_alter: false
in_batch_alter: true
use_op_buffer: false
use_var_predict: true
with_prior: false
predict_return: true
use_bisim: false
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

config/sac_transfer/3DBallHardTransfer.yaml (47 changes)


behaviors:
3DBallHard:
trainer_type: sac_transfer
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
batch_size: 256
buffer_size: 50000
buffer_init_steps: 0
tau: 0.005
steps_per_update: 10.0
save_replay_buffer: false
init_entcoef: 1.0
reward_signal_steps_per_update: 10.0
encoder_layers: 1
policy_layers: 1
forward_layers: 1
value_layers: 2
feature_size: 16
reuse_encoder: false
in_epoch_alter: false
in_batch_alter: true
use_op_buffer: false
use_var_predict: true
with_prior: false
predict_return: true
use_bisim: false
use_transfer: true
load_policy: false
load_value: false
load_model: true
train_model: false
transfer_path: "results/"
network_settings:
normalize: true
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true
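With use_transfer: true, this configuration expects a previous run whose encoder/model checkpoints are available under transfer_path. A hedged example of launching it (the run ID is illustrative and not part of this commit):

mlagents-learn config/sac_transfer/3DBallHardTransfer.yaml --run-id=3DBallHard_transfer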