
bisim

/develop/bisim-review
yanchaosun, 4 years ago
Current commit
cdaaa318
6 files changed, 183 insertions and 44 deletions
  1. config/ppo_transfer/WalkerStaticSingle.yaml (4 changes)
  2. ml-agents/mlagents/trainers/policy/transfer_policy.py (103 changes)
  3. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (104 changes)
  4. ml-agents/mlagents/trainers/settings.py (1 change)
  5. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (12 changes)
  6. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (3 changes)

config/ppo_transfer/WalkerStaticSingle.yaml (4 changes)


hyperparameters:
  batch_size: 2048
  buffer_size: 20480
- learning_rate: 0.0001
+ learning_rate: 0.0003
  beta: 0.005
  epsilon: 0.2
  lambd: 0.95

  inverse_layers: 1
  separate_value_net: true
  feature_size: 64
- reuse_encoder: true
+ reuse_encoder: false
  in_epoch_alter: true
  use_op_buffer: true
network_settings:

ml-agents/mlagents/trainers/policy/transfer_policy.py (103 changes)


        predict_return=False,
        inverse_model=False,
        reuse_encoder=False,
        use_bisim=False,
    ) -> None:
        """
        Builds the tensorflow graph needed for this policy.

        if predict_return:
            with tf.variable_scope("reward"):
                self.create_reward_model(self.encoder, self.targ_encoder, forward_layers)
        if use_bisim:
            self.create_bisim_model(self.h_size, self.feature_size, encoder_layers,
                                    self.vis_encode_type, forward_layers, var_predict, predict_return)
        # if var_predict:
        #     self.predict_distribution, self.predict = self._create_var_world_model(

            )
            self.reward_loss = tf.reduce_mean(
                tf.squared_difference(self.pred_reward, self.current_reward)
            )
        )
    def create_bisim_model(
        self,
        h_size: int,
        feature_size: int,
        encoder_layers: int,
        vis_encode_type: EncoderType,
        forward_layers: int,
        var_predict: bool,
        predict_return: bool,
    ) -> None:
        with tf.variable_scope("encoding"):
            self.visual_bisim = ModelUtils.create_visual_input_placeholders(
                self.brain.camera_resolutions
            )
            self.vector_bisim = ModelUtils.create_vector_input(self.vec_obs_size)
            if self.normalize:
                self.processed_vector_bisim = ModelUtils.normalize_vector_obs(
                    self.vector_bisim,
                    self.running_mean,
                    self.running_variance,
                    self.normalization_steps,
                )
            else:
                self.processed_vector_bisim = self.vector_bisim
            hidden_stream = ModelUtils.create_observation_streams(
                self.visual_bisim,
                self.vector_bisim,
                1,
                h_size,
                encoder_layers,
                vis_encode_type,
                reuse=True,
            )[0]
            self.bisim_encoder = tf.layers.dense(
                hidden_stream,
                feature_size,
                name="latent",
                activation=ModelUtils.swish,
                kernel_initializer=tf.initializers.variance_scaling(1.0),
                reuse=True,
            )
        combined_input = tf.concat(
            [self.bisim_encoder, self.current_action], axis=1
        )
        with tf.variable_scope("predict"):
            hidden = combined_input
            for i in range(forward_layers):
                hidden = tf.layers.dense(
                    hidden,
                    self.h_size,
                    name="hidden_{}".format(i),
                    reuse=True,
                    # activation=ModelUtils.swish,
                    # kernel_initializer=tf.initializers.variance_scaling(1.0),
                )
            if var_predict:
                self.bisim_predict_distribution = GaussianEncoderDistribution(
                    hidden,
                    self.feature_size
                )
                self.bisim_predict = self.bisim_predict_distribution.sample()
            else:
                self.bisim_predict = tf.layers.dense(
                    hidden,
                    self.feature_size,
                    name="latent",
                    reuse=True,
                    # activation=ModelUtils.swish,
                    # kernel_initializer=tf.initializers.variance_scaling(1.0),
                )
        if predict_return:
            with tf.variable_scope("reward"):
                hidden = combined_input
                for i in range(forward_layers):
                    hidden = tf.layers.dense(
                        hidden,
                        self.h_size
                        * (self.vis_obs_size + int(self.vec_obs_size > 0)),
                        name="hidden_{}".format(i),
                        reuse=True,
                        # activation=ModelUtils.swish,
                        # kernel_initializer=tf.initializers.variance_scaling(1.0),
                    )
                self.bisim_pred_reward = tf.layers.dense(
                    hidden,
                    1,
                    name="reward",
                    reuse=True,
                    # activation=ModelUtils.swish,
                    # kernel_initializer=tf.initializers.variance_scaling(1.0),
                )
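Note on the structure: create_bisim_model rebuilds the encoder, the forward ("predict") head and the reward head under the same variable scopes with reuse=True, so the second observation placeholder (vector_bisim) is pushed through exactly the same weights as the main branch rather than a fresh copy. A minimal sketch of that TF 1.x weight-sharing pattern, with made-up placeholder names and shapes that are not part of the commit:

import tensorflow as tf  # TF 1.x graph mode, as in the rest of this file

obs_a = tf.placeholder(tf.float32, [None, 8], name="obs_a")
obs_b = tf.placeholder(tf.float32, [None, 8], name="obs_b")

with tf.variable_scope("encoding"):
    latent_a = tf.layers.dense(obs_a, 4, name="latent")              # creates encoding/latent/*
with tf.variable_scope("encoding"):
    latent_b = tf.layers.dense(obs_b, 4, name="latent", reuse=True)  # reuses the same kernel/bias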

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (104 changes)


        self.use_inverse_model = hyperparameters.use_inverse_model
        self.predict_return = hyperparameters.predict_return
        self.reuse_encoder = hyperparameters.reuse_encoder
        self.use_bisim = hyperparameters.use_bisim
        self.use_alter = hyperparameters.use_alter
        self.in_batch_alter = hyperparameters.in_batch_alter

        self.ppo_update_dict: Dict[str, tf.Tensor] = {}
        self.model_update_dict: Dict[str, tf.Tensor] = {}
        self.bisim_update_dict: Dict[str, tf.Tensor] = {}
-           self.predict_return, self.use_inverse_model, self.reuse_encoder)
+           self.predict_return, self.use_inverse_model, self.reuse_encoder, self.use_bisim)
        with policy.graph.as_default():
            super().__init__(policy, trainer_params)

        self.stats_name_to_update_name.update({
            "Losses/Reward Loss": "reward_loss",
        })
        # if self.use_bisim:
        #     self.stats_name_to_update_name.update({
        #         "Losses/Bisim Loss": "bisim_loss",
        #     })
        if self.policy.use_recurrent:
            self.m_size = self.policy.m_size
            self.memory_in = tf.placeholder(

                int(max_step),
                min_value=1e-10,
            )
            self.bisim_learning_rate = ModelUtils.create_schedule(
                ScheduleType.LINEAR,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy.total_log_probs,
                self.old_log_probs,

            )
            self.returns_holders[name] = returns_holder
            self.old_values[name] = old_value
        self.advantage = tf.placeholder(
            shape=[None], dtype=tf.float32, name="advantages"
        )

        if self.use_inverse_model:
            self.model_loss += 0.5 * self.policy.inverse_loss
            # self.model_loss = 0.2 * self.policy.forward_loss + 0.8 * self.policy.inverse_loss
        if self.use_bisim:
            if self.use_var_predict:
                predict_diff = self.policy.predict_distribution.w_distance(
                    self.policy.bisim_predict_distribution
                )
            else:
                predict_diff = tf.reduce_mean(
                    tf.squared_difference(self.policy.bisim_predict, self.policy.predict)
                )
            if self.predict_return:
                reward_diff = tf.reduce_mean(
                    tf.squared_difference(self.policy.bisim_pred_reward, self.policy.pred_reward)
                )
                predict_diff = (
                    self.reward_signals["extrinsic_value"].gamma * predict_diff + reward_diff
                )
            encode_dist = tf.reduce_mean(
                tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
            )
            self.bisim_loss = tf.squared_difference(encode_dist, predict_diff)

        self.loss = (
            self.policy_loss
            + self.model_loss

        )
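The branch above regresses the distance between the two latent codes onto a bisimulation-style target: the squared difference of the predicted rewards plus the extrinsic gamma times the squared difference of the predicted next latents (the var_predict case swaps the latter for a Wasserstein distance between the two predicted Gaussians). A NumPy sketch of the deterministic case, with illustrative names only:

import numpy as np

def bisim_encoder_loss(z1, z2, next_pred1, next_pred2, r_pred1, r_pred2, gamma=0.99):
    # z1, z2: latent codes of two states from the shared encoder
    # next_pred1/2: predicted next latents for the same action
    # r_pred1/2: predicted rewards for the two states
    predict_diff = np.mean((next_pred1 - next_pred2) ** 2)
    reward_diff = np.mean((r_pred1 - r_pred2) ** 2)
    target = gamma * predict_diff + reward_diff        # bisimulation-style distance
    encode_dist = np.mean((z1 - z2) ** 2)              # distance in latent space
    return (encode_dist - target) ** 2                 # squared_difference(encode_dist, target)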
    def _create_ppo_optimizer_ops(self):
        # if self.use_transfer:
        #     if self.transfer_type == "dynamics":
        #         if self.train_type == "all":
        #             train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        #         elif self.train_type == "encoding":
        #             train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
        #             # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
        #         elif self.train_type == "policy":
        #             train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
        #             train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
        #             train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
        #         print("trainable", train_vars)
        #         # train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
        #         # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
        #         # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
        #         # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/mu")
        #         # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/log_std")
        #         # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
        #     elif self.transfer_type == "observation":
        #         if self.train_type == "all":
        #             train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        #         elif self.train_type == "policy":
        #             train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
        #                 + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
        #                 + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
        #                 + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
        #                 # + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
        # else:
        #     train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        train_vars = []
        if self.train_encoder:
            train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")

        self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)
        self.update_batch = self.tf_optimizer.minimize(self.loss, var_list=train_vars)
        if self.use_bisim:
            bisim_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
            self.bisim_optimizer = self.create_optimizer_op(self.bisim_learning_rate)
            self.bisim_grads = self.bisim_optimizer.compute_gradients(self.bisim_loss, var_list=bisim_train_vars)
            self.bisim_update_batch = self.bisim_optimizer.minimize(self.bisim_loss, var_list=bisim_train_vars)
            self.bisim_update_dict.update(
                {
                    "bisim_loss": self.bisim_loss,
                    "update_batch": self.bisim_update_batch,
                    "bisim_learning_rate": self.bisim_learning_rate,
                }
            )
    def _init_alter_update(self):

            if update_name in update_vals.keys():
                update_stats[stat_name] = update_vals[update_name]
        self.num_updates += 1
        return update_stats

    def update_encoder(self, mini_batch1: AgentBuffer, mini_batch2: AgentBuffer, mini_batch3: AgentBuffer):
        stats_needed = {
            "Losses/Bisim Loss": "bisim_loss",
            "Policy/Bisim Learning Rate": "bisim_learning_rate",
        }
        update_stats = {}
        feed_dict = {
            self.policy.vector_in: mini_batch1["vector_in"],
            self.policy.vector_bisim: mini_batch2["vector_in"],
            self.policy.current_action: mini_batch3["actions"],
        }
        update_vals = self._execute_model(feed_dict, self.bisim_update_dict)
        for stat_name, update_name in stats_needed.items():
            if update_name in update_vals.keys():
                update_stats[stat_name] = update_vals[update_name]
        return update_stats
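update_encoder only executes the ops registered in bisim_update_dict, so it can be interleaved with the regular PPO update. A hedged sketch of how it could be driven from the trainer side; buffer, optimizer and the sampling helper are assumptions standing in for whatever the trainer already holds, not part of this commit:

# Illustrative only: two independent state batches for the two encoder copies,
# plus a batch supplying the shared action input.
batch_size = 128
mini_batch1 = buffer.sample_mini_batch(batch_size)   # feeds policy.vector_in
mini_batch2 = buffer.sample_mini_batch(batch_size)   # feeds policy.vector_bisim
mini_batch3 = buffer.sample_mini_batch(batch_size)   # feeds policy.current_action
stats = optimizer.update_encoder(mini_batch1, mini_batch2, mini_batch3)
# stats maps "Losses/Bisim Loss" and "Policy/Bisim Learning Rate" to their latest values.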
    def _construct_feed_dict(

            self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
            self.advantage: mini_batch["advantages"],
            self.all_old_log_probs: mini_batch["action_probs"],
            self.policy.processed_vector_next: mini_batch["next_vector_in"],
            # self.policy.next_vector_in: mini_batch["next_vector_in"],
            self.policy.vector_next: mini_batch["next_vector_in"],
            self.policy.current_action: mini_batch["actions"],
            self.policy.current_reward: mini_batch["extrinsic_rewards"],
            # self.dis_returns: mini_batch["discounted_returns"]

        )
        # print(self.policy.sess.run(self.policy.encoder, feed_dict={self.policy.vector_in: mini_batch["vector_obs"]}))
        return feed_dict

    def _create_cc_critic_old(
        self, h_size: int, num_layers: int, vis_encode_type: EncoderType

ml-agents/mlagents/trainers/settings.py (1 change)


    train_policy: bool = True
    train_value: bool = True
    feature_size: int = 16
    use_bisim: bool = False
    # Transfer
    use_transfer: bool = False
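For orientation, these flags live in the trainer's attrs-based hyperparameter settings, which is how the optimizer above can read hyperparameters.use_bisim. A reduced sketch; the class name and decorator style are placeholders, and only the fields shown in the diff are taken from the commit:

import attr

@attr.s(auto_attribs=True)
class PPOTransferSettings:  # placeholder name for the real settings class
    train_policy: bool = True
    train_value: bool = True
    feature_size: int = 16
    use_bisim: bool = False
    # Transfer
    use_transfer: bool = False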

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (12 changes)


        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=16,
        buffer_size=64,
-       feature_size=2,
+       feature_size=4,
        reuse_encoder=True,
        in_epoch_alter=True,
        # in_batch_alter=True,

    # assert all(reward > success_threshold for reward in processed_rewards)

-def test_2d_model(config=Transfer_CONFIG, obs_spec_type="normal", run_id="model_normal", seed=0):
+def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="model_rich2_f4", seed=0):
    env = SimpleTransferEnvironment(
        [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
        num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"

    config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)
    _check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)

-def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="transfer_rich2_from-rich1", seed=1337):
+def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="transfer_rich2_from-normal", seed=1337):
    env = SimpleTransferEnvironment(
        [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
        num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"

-       transfer_path="./transfer_results/model_rich1_s0/Simple",
+       transfer_path="./transfer_results/model_normal_f4_s0/Simple",
-       train_policy=False, train_value=False, train_model=False, feature_size=2
+       train_policy=False, train_value=False, train_model=False, feature_size=4
    )
    config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)
    _check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)

    # test_2d_model(seed=0)
    # test_2d_model(config=PPO_CONFIG, run_id="ppo_normal", seed=0)
    # test_2d_model(config=PPO_CONFIG, run_id="ppo_rich2", seed=0)
    test_2d_transfer(seed=0)
    # for i in range(5):
    #     test_2d_model(seed=i)
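Read together, the two tests form a two-stage experiment: train a source model first, then load its encoder when training on the richer observation spec. A hedged sketch of running both stages in sequence; the run ids and the saved-model path mirror the defaults above, but the exact output layout depends on _check_environment_trains:

if __name__ == "__main__":
    # Stage 1: train from scratch on the plain observation spec; the transfer
    # test above expects the result under ./transfer_results/model_normal_f4_s0/.
    test_2d_model(obs_spec_type="normal", run_id="model_normal_f4", seed=0)
    # Stage 2: reuse that encoder while training on the richer spec.
    test_2d_transfer(obs_spec_type="rich2", run_id="transfer_rich2_from-normal", seed=0)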

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (3 changes)


            self.goal[name] = []
            for _ in range(self.num_vector):
                self.goal[name].append(self.random.uniform(-1, 1))
-       self.positions[name] = [0.0 for _ in range(self.action_size)]
+       self.positions[name] = [self.random.uniform(-1, 1) for _ in range(self.action_size)]
        # print("new pos:", self.positions[name])

    def _make_batched_step(
        self, name: str, done: bool, reward: float
