
new bisim implementation

/develop/bisim-review
yanchaosun, 4 years ago
Current commit a212fef9
6 files changed, with 99 insertions and 56 deletions
1. config/ppo_transfer/CrawlerStaticP0.yaml (15 changes)
2. ml-agents/mlagents/trainers/policy/tf_policy.py (1 change)
3. ml-agents/mlagents/trainers/policy/transfer_policy.py (55 changes)
4. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (40 changes)
5. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (28 changes)
6. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (16 changes)

config/ppo_transfer/CrawlerStaticP0.yaml (15 changes)


policy_layers: 2
forward_layers: 2
value_layers: 3
- feature_size: 32
+ feature_size: 128
- # use_op_buffer: true
- # use_var_predict: true
- # with_prior: true
- # predict_return: true
+ use_op_buffer: true
+ use_var_predict: true
+ with_prior: true
+ predict_return: true
use_transfer: true
load_policy: false
load_value: false
train_model: false
transfer_path: "results/new_crawler_single_2-2-3o-2_rew_varp_f128/CrawlerStatic"
network_settings:
normalize: true
hidden_units: 512
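Reviewer note: feature_size moves from 32 to 128, matching the f128 source run named in transfer_path, and the previously commented-out model/bisim options are switched on. Below is a hypothetical sanity check, not part of the commit; it only assumes PyYAML and that the keys sit at the level shown above, which may differ from the trainer's real config nesting.

import yaml

# Load the edited config and print the switches this change turns on.
with open("config/ppo_transfer/CrawlerStaticP0.yaml") as f:
    cfg = yaml.safe_load(f)

for key in ("feature_size", "use_op_buffer", "use_var_predict",
            "with_prior", "predict_return", "use_transfer"):
    print(key, cfg.get(key))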

ml-agents/mlagents/trainers/policy/tf_policy.py (1 change)


self.running_variance: Optional[tf.Variable] = None
self.update_normalization_op: Optional[tf.Operation] = None
self.vn_update_normalization_op: Optional[tf.Operation] = None
+ self.bi_update_normalization_op: Optional[tf.Operation] = None
self.value: Optional[tf.Tensor] = None
self.all_log_probs: tf.Tensor = None
self.total_log_probs: Optional[tf.Tensor] = None

ml-agents/mlagents/trainers/policy/transfer_policy.py (55 changes)


self.reuse_encoder = reuse_encoder
self.feature_size = feature_size
self.predict_return = predict_return
self.use_bisim = use_bisim
with self.graph.as_default():
tf.set_random_seed(self.seed)

with tf.variable_scope("reward"):
self.create_reward_model(self.encoder, self.targ_encoder, forward_layers)
- if use_bisim:
+ if self.use_bisim:
self.create_bisim_model(self.h_size, self.feature_size, encoder_layers,
self.vis_encode_type, forward_layers, var_predict, predict_return)

e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoding')
with tf.variable_scope('hard_replacement'):
- self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
+ self.target_replace_op = [tf.assign(t, 0.9*t + 0.1*e) for t, e in zip(t_params, e_params)]
def run_hard_copy(self):
self.sess.run(self.target_replace_op)
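Reviewer note: the hard copy of encoder weights into the target encoder becomes a soft update (t <- 0.9*t + 0.1*e), although the method is still named run_hard_copy. A self-contained TF1 sketch of the same Polyak-style pattern, using made-up scope names rather than the repository's actual graph:

import tensorflow as tf

tf.reset_default_graph()
x = tf.placeholder(tf.float32, [None, 8])

# Online encoder and target encoder with identical shapes (placeholder scopes).
with tf.variable_scope("encoding"):
    enc = tf.layers.dense(x, 16)
with tf.variable_scope("target_enc"):
    targ = tf.layers.dense(x, 16)

e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")

tau = 0.1  # the diff uses t <- 0.9*t + 0.1*e, i.e. tau = 0.1
soft_replace_op = [tf.assign(t, (1.0 - tau) * t + tau * e)
                   for t, e in zip(t_params, e_params)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(soft_replace_op)  # called periodically, e.g. every copy_every updates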

reward_checkpoint = os.path.join(self.model_path, f"reward.ckpt")
reward_saver.save(self.sess, reward_checkpoint)
- def create_target_normalizer(self, vector_obs: tf.Tensor) -> NormalizerTensors:
+ def create_target_normalizer(self, vector_obs: tf.Tensor, prefix="vn") -> NormalizerTensors:
- "vn_normalization_steps",
+ prefix+"_normalization_steps",
[],
trainable=False,
dtype=tf.int32,

"vn_running_mean",
prefix+"vn_running_mean",
[vec_obs_size],
trainable=False,
dtype=tf.float32,

"vn_running_variance",
prefix+"vn_running_variance",
[vec_obs_size],
trainable=False,
dtype=tf.float32,

update_normalization, steps, running_mean, running_variance
)
- def update_normalization(self, vector_obs: np.ndarray, vector_obs_next: np.ndarray) -> None:
+ def update_normalization(self, vector_obs: np.ndarray, vector_obs_next: np.ndarray, vector_obs_bisim: np.ndarray) -> None:
"""
If this policy normalizes vector observations, this will update the norm values in the graph.
:param vector_obs: The vector observations to add to the running estimate of the distribution.

self.sess.run(
self.vn_update_normalization_op, feed_dict={self.vector_next: vector_obs_next}
)
+ if self.use_bisim:
+ self.sess.run(
+ self.bi_update_normalization_op, feed_dict={self.vector_bisim: vector_obs_bisim}
+ )
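Reviewer note: create_target_normalizer now takes a prefix so the next-state and bisimulation observation streams each keep their own running statistics, and update_normalization feeds the extra bi_update_normalization_op when use_bisim is set. A generic sketch of such a prefixed running-stats normalizer (variance update omitted; this is not the repository's NormalizerTensors implementation):

import tensorflow as tf

def make_stream_normalizer(vector_obs, vec_obs_size, prefix):
    # One running-stats normalizer per observation stream, told apart by prefix
    # (e.g. "vn" for next-state obs, "bi" for the bisimulation batch).
    steps = tf.get_variable(prefix + "_normalization_steps", [], dtype=tf.int32,
                            trainable=False, initializer=tf.zeros_initializer())
    mean = tf.get_variable(prefix + "_running_mean", [vec_obs_size], dtype=tf.float32,
                           trainable=False, initializer=tf.zeros_initializer())
    variance = tf.get_variable(prefix + "_running_variance", [vec_obs_size], dtype=tf.float32,
                               trainable=False, initializer=tf.ones_initializer())

    # Incremental mean update over the batch; the variance update is omitted here.
    batch_size = tf.cast(tf.shape(vector_obs)[0], tf.float32)
    new_steps = steps + tf.shape(vector_obs)[0]
    batch_mean = tf.reduce_mean(vector_obs, axis=0)
    new_mean = mean + (batch_mean - mean) * batch_size / tf.cast(new_steps, tf.float32)
    update_op = tf.group(tf.assign(mean, new_mean), tf.assign(steps, new_steps))

    normalized = tf.clip_by_value((vector_obs - mean) / tf.sqrt(variance), -5.0, 5.0)
    return normalized, update_op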
def get_encoder_weights(self):
with self.graph.as_default():

def create_forward_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, forward_layers: int,
- var_predict: bool=False
+ var_predict: bool=False, separate_train: bool=False
) -> None:
"""
Creates forward model TensorFlow ops for Curiosity module.

[encoded_state, self.current_action], axis=1
)
hidden = combined_input
+ if separate_train:
+ hidden = tf.stop_gradient(hidden)
for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,

tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
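Reviewer note: both the forward model and the reward model gain a separate_train flag that stops gradients at the encoder output, so these heads can be trained without moving the shared encoder. A minimal TF1 sketch of the pattern, with placeholder tensor names:

import tensorflow as tf

def prediction_head(encoded_state, current_action, forward_layers, h_size,
                    separate_train=False):
    # The forward/reward heads consume the encoded state plus the action.
    hidden = tf.concat([encoded_state, current_action], axis=1)
    if separate_train:
        # Train only the head: the shared encoder gets no gradient from this loss.
        hidden = tf.stop_gradient(hidden)
    for _ in range(forward_layers):
        hidden = tf.layers.dense(hidden, h_size, activation=tf.nn.relu)
    return hidden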
- def create_reward_model(self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, forward_layers: int):
+ def create_reward_model(self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor,
+ forward_layers: int, separate_train: bool=False):
combined_input = tf.concat(
[encoded_state, self.current_action], axis=1

+ if separate_train:
+ hidden = tf.stop_gradient(hidden)
for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,

self.brain.camera_resolutions
)
self.vector_bisim = ModelUtils.create_vector_input(self.vec_obs_size)
- # if self.normalize:
- # self.processed_vector_bisim = ModelUtils.normalize_vector_obs(
- # self.vector_bisim,
- # self.running_mean,
- # self.running_variance,
- # self.normalization_steps,
- # )
- # else:
- # self.processed_vector_bisim = self.vector_bisim
+ if self.normalize:
+ bi_normalization_tensors = self.create_target_normalizer(self.vector_bisim)
+ self.bi_update_normalization_op = bi_normalization_tensors.update_op
+ self.bi_normalization_steps = bi_normalization_tensors.steps
+ self.bi_running_mean = bi_normalization_tensors.running_mean
+ self.bi_running_variance = bi_normalization_tensors.running_variance
+ self.processed_vector_bisim = ModelUtils.normalize_vector_obs(
+ self.vector_bisim,
+ self.bi_running_mean,
+ self.bi_running_variance,
+ self.bi_normalization_steps,
+ )
+ else:
+ self.processed_vector_bisim = self.vector_bisim
self.vp_update_normalization_op = None
- self.vector_bisim,
+ self.processed_vector_bisim,
1,
h_size,
encoder_layers,

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (40 changes)


self.ppo_update_dict: Dict[str, tf.Tensor] = {}
self.model_update_dict: Dict[str, tf.Tensor] = {}
self.model_only_update_dict: Dict[str, tf.Tensor] = {}
self.bisim_update_dict: Dict[str, tf.Tensor] = {}
# Create the graph here to give more granular control of the TF graph to the Optimizer.

self.num_updates = 0
self.alter_every = 400
- self.copy_every = 10
+ self.copy_every = 1
self.old_loss = np.inf
self.update_mode = "model"

)
self.model_learning_rate = ModelUtils.create_schedule(
ScheduleType.LINEAR,
# ScheduleType.CONSTANT,
lr,
self.policy.global_step,
int(max_step),

train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)

self.model_optimizer = self.create_optimizer_op(self.model_learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.model_only_optimizer = self.create_optimizer_op(self.model_learning_rate)
self.model_only_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=model_train_vars)
self.model_only_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=model_train_vars)
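Reviewer note: the optimizer builds distinct update ops over different variable groups, so PPO updates touch policy and value variables while model_only updates touch only the model variables. A generic TF1 sketch of scope-based variable selection and per-group minimize ops; the scope names here are placeholders, not necessarily the ones used in this file:

import tensorflow as tf

def build_update_ops(ppo_loss, model_loss, learning_rate):
    # Variables are selected by the scopes they were created under
    # ("policy", "value", "predict" are placeholders).
    policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
    value_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
    model_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")

    ppo_opt = tf.train.AdamOptimizer(learning_rate)
    model_opt = tf.train.AdamOptimizer(learning_rate)

    # One op trains policy + value on the PPO loss; the other trains only the
    # model/prediction variables on the model loss.
    ppo_update = ppo_opt.minimize(ppo_loss, var_list=policy_vars + value_vars)
    model_only_update = model_opt.minimize(model_loss, var_list=model_vars)
    return ppo_update, model_only_update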
self.ppo_update_dict.update(
{

}
)
self.model_only_update_dict.update(
{
"model_loss": self.model_loss,
"update_batch": self.model_only_update_batch,
"model_learning_rate": self.model_learning_rate,
}
)
if self.predict_return:
self.ppo_update_dict.update({
"reward_loss": self.policy.reward_loss,

"reward_loss": self.policy.reward_loss,
})
self.model_only_update_dict.update({
"reward_loss": self.policy.reward_loss,
})

"""
feed_dict = self._construct_feed_dict(batch, num_sequences)
stats_needed = self.stats_name_to_update_name
# if update_type == "model":
# stats_needed = {
# "Losses/Model Loss": "model_loss",
# "Policy/Learning Rate": "model_learning_rate",
# "Policy/Epsilon": "decay_epsilon",
# "Policy/Beta": "decay_beta",
# }
# elif update_type == "policy":
# stats_needed = {
# "Losses/Value Loss": "value_loss",
# "Losses/Policy Loss": "policy_loss",
# "Policy/Learning Rate": "learning_rate",
# "Policy/Epsilon": "decay_epsilon",
# "Policy/Beta": "decay_beta",
# }
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():

update_vals = self._execute_model(feed_dict, self.model_update_dict)
elif update_type == "policy":
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
elif update_type == "model_only":
update_vals = self._execute_model(feed_dict, self.model_only_update_dict)
# update target encoder
if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
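Reviewer note: after each update the target encoder is refreshed every copy_every steps when reuse_encoder is off, and copy_every drops from 10 to 1 in this commit. A self-contained toy of that control flow; every name below is a stand-in for the optimizer's real members, not the repository API:

class ToyOptimizer:
    def __init__(self, reuse_encoder=False, copy_every=1):
        self.reuse_encoder = reuse_encoder
        self.copy_every = copy_every      # this commit lowers it from 10 to 1
        self.num_updates = 0
        self.copies = 0

    def _execute(self, update_type):
        return {"loss": 0.0, "type": update_type}   # stands in for sess.run

    def update_part(self, update_type):
        stats = self._execute(update_type)          # "model", "policy" or "model_only"
        self.num_updates += 1
        if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
            self.copies += 1                        # stands in for policy.run_hard_copy()
        return stats

opt = ToyOptimizer()
for kind in ("model", "policy", "model_only"):
    opt.update_part(kind)
print(opt.copies)  # 3: with copy_every = 1, the (now soft) copy runs after every update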

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (28 changes)


agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
- self.policy.update_normalization(agent_buffer_trajectory["vector_obs"], agent_buffer_trajectory["next_vector_in"])
+ self.policy.update_normalization(agent_buffer_trajectory["vector_obs"],
+ agent_buffer_trajectory["next_vector_in"], agent_buffer_trajectory["vector_obs"])
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
- def _update_policy(self):
+ def _update_policy_old(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.

batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
if self.use_iealter:
# if self.train_model:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# else:
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer = self.update_buffer
# max_num_batch = buffer_length // batch_size
# for i in range(0, max_num_batch * batch_size, batch_size):
# update_stats = self.optimizer.update_part(
# buffer.make_mini_batch(i, i + batch_size), n_sequences, "model_only"
# )
# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer

for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
self.num_update += 1
return True

return True
- def _update_policy_new(self):
+ def _update_policy(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
- for _ in range(num_epoch):
+ # for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.off_policy_buffer
max_num_batch = update_buffer_length // batch_size # update with as much data as the policy has

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
- if self.use_bisim:
- for _ in range(num_epoch):
+ if self.use_bisim:
+ # for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.off_policy_buffer)
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
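Reviewer note: for the bisimulation loss the off-policy buffer is shuffled, deep-copied, and shuffled again, so aligned mini-batches from the copy and the re-shuffled original form random state pairs. A toy sketch of the pairing idea on plain Python lists; the real code operates on ML-Agents AgentBuffer objects:

import copy
import random

def make_bisim_pairs(buffer, batch_size):
    # Shuffle once, snapshot the ordering, shuffle again: position i in the
    # copy and position i in the re-shuffled original now hold (mostly)
    # different experiences, giving a random state pair per mini-batch slot.
    random.shuffle(buffer)
    buffer1 = copy.deepcopy(buffer)
    random.shuffle(buffer)
    for i in range(0, len(buffer) - batch_size + 1, batch_size):
        yield buffer1[i:i + batch_size], buffer[i:i + batch_size]

# Usage sketch: each pair of mini-batches would feed one bisimulation update.
pairs = list(make_bisim_pairs(list(range(16)), 4))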

for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
- max_num_batch = buffer_length // batch_size
+ max_num_batch = update_buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
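Reviewer note: the main update keeps the usual shuffle / mini-batch / aggregate-stats loop, now sized by update_buffer_length. A generic sketch of that loop shape; run_update and shuffle are placeholders for the optimizer and buffer calls in this file:

import random
from collections import defaultdict

def run_epochs(buffer, num_epoch, batch_size, run_update, shuffle):
    # Accumulate every reported stat across all mini-batch updates.
    batch_update_stats = defaultdict(list)
    for _ in range(num_epoch):
        shuffle(buffer)
        max_num_batch = len(buffer) // batch_size
        for i in range(0, max_num_batch * batch_size, batch_size):
            update_stats = run_update(buffer[i:i + batch_size])
            for stat_name, value in update_stats.items():
                batch_update_stats[stat_name].append(value)
    # Report the mean of each stat, as the trainer's stats reporter does.
    return {k: sum(v) / len(v) for k, v in batch_update_stats.items()}

# e.g. run_epochs(list(range(64)), num_epoch=3, batch_size=16,
#                 run_update=lambda mb: {"Losses/Model Loss": 0.0}, shuffle=random.shuffle)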

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (16 changes)


batch_size=16,
buffer_size=64,
feature_size=4,
- reuse_encoder=True,
+ reuse_encoder=False,
in_epoch_alter=True,
# in_batch_alter=True,
use_op_buffer=True,

# assert all(reward > success_threshold for reward in processed_rewards)
- def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich1", run_id="model_rich1_f4_pv-l0_rew-0.5", seed=0):
+ def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich1", run_id="model_rich1", seed=0):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"

use_bisim=True, predict_return=True,
# separate_value_train=True, separate_policy_train=True,
use_var_predict=True, with_prior=True, use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False,
policy_layers=0, value_layers=0, encoder_layers=2, feature_size=4,

transfer_path=transfer_from,
use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False, learning_rate=5.0e-3,
train_policy=True, train_value=True, train_model=False, feature_size=4,
- use_var_predict=True, with_prior=True, policy_layers=0, load_policy=False,
- load_value=False, predict_return=True, value_layers=0, encoder_layers=2,
+ use_var_predict=True, with_prior=True, policy_layers=1, load_policy=False,
+ load_value=False, predict_return=True, value_layers=1, encoder_layers=1,
- config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)
+ config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=300000, summary_freq=5000)
for obs in ["normal"]: # ["normal", "rich1", "rich2"]:
for obs in ["rich1", "rich2"]: # ["normal", "rich1", "rich2"]:
+ "_f4_pv-l0_rew_bisim-op_samelen")
+ "_f4_pv-l1_rew_bisim-op_newalter_noreuse-soft0.1")
# test_2d_model(config=SAC_CONFIG, run_id="sac_rich2_hard", seed=0)
# for obs in ["normal", "rich1"]:
# test_2d_transfer(seed=0, obs_spec_type="rich2",
