
merge YC changes

/develop/bisim-review
Andrew Cohen, 4 years ago
Current commit
5fa28f5f
6 files changed, with 147 insertions and 333 deletions
  1. config/ppo_transfer/CrawlerStatic.yaml (17 changes)
  2. config/ppo_transfer/OldCrawlerStatic.yaml (17 changes)
  3. ml-agents/mlagents/trainers/policy/transfer_policy.py (309 changes)
  4. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (38 changes)
  5. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (46 changes)
  6. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (53 changes)

config/ppo_transfer/CrawlerStatic.yaml (17 changes)


epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
encoder_layers: 2
policy_layers: 2
learning_rate_schedule: constant
encoder_layers: 3
policy_layers: 0
forward_layers: 0
feature_size: 32
feature_size: 128
in_epoch_alter: true
in_epoch_alter: false
use_op_buffer: false
use_var_predict: true
with_prior: false
predict_return: true
use_bisim: false
separate_value_net: true
network_settings:
normalize: true
hidden_units: 512

config/ppo_transfer/OldCrawlerStatic.yaml (17 changes)


epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
encoder_layers: 2
policy_layers: 2
value_layers: 3
learning_rate_schedule: constant
encoder_layers: 3
policy_layers: 0
forward_layers: 0
value_layers: 2
in_epoch_alter: true
use_op_buffer: true
in_epoch_alter: false
use_op_buffer: false
with_prior: true
with_prior: false
use_bisim: true
use_bisim: false
separate_value_net: true
network_settings:
normalize: true

ml-agents/mlagents/trainers/policy/transfer_policy.py (309 changes)


def w_distance(self, another):
return tf.sqrt(
tf.reduce_sum(tf.squared_difference(self.mu, another.mu), axis=1) \
tf.reduce_sum(tf.squared_difference(self.mu, another.mu), axis=1)
+ tf.reduce_sum(tf.squared_difference(self.sigma, another.sigma), axis=1)
)
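
For reference, w_distance above is the 2-Wasserstein distance between two diagonal Gaussian encoder distributions: W2(N(mu_a, diag(sigma_a^2)), N(mu_b, diag(sigma_b^2)))^2 = ||mu_a - mu_b||^2 + ||sigma_a - sigma_b||^2. A minimal standalone NumPy sketch of the same quantity (illustrative only, not part of this commit):

import numpy as np

def w2_distance(mu_a, sigma_a, mu_b, sigma_b):
    # Per-row 2-Wasserstein distance between diagonal Gaussians:
    # W2^2 = ||mu_a - mu_b||^2 + ||sigma_a - sigma_b||^2
    return np.sqrt(
        np.sum(np.square(mu_a - mu_b), axis=1)
        + np.sum(np.square(sigma_a - sigma_b), axis=1)
    )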

self.feature_size = feature_size
self.predict_return = predict_return
self.use_bisim = use_bisim
self.transfer = transfer
with self.graph.as_default():
tf.set_random_seed(self.seed)

reuse_encoder,
)
if not reuse_encoder:
self.targ_encoder = tf.stop_gradient(self.targ_encoder)
self._create_hard_copy()
# if not reuse_encoder:
# self.targ_encoder = tf.stop_gradient(self.targ_encoder)
# self._create_hard_copy()
if self.inverse_model:
with tf.variable_scope("inverse"):

feature_size,
name="latent",
reuse=reuse_encoder,
activation=tf.tanh,#ModelUtils.swish,
activation=tf.tanh, # ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent_targ

hidden_stream,
feature_size,
name="latent",
activation=tf.tanh,#ModelUtils.swish,
activation=tf.tanh, # ModelUtils.swish,
def _create_var_target_encoder(
self,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
reuse_encoder: bool,
) -> tf.Tensor:
if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.vector_next,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.vector_next
with tf.variable_scope(next_encoder_scope):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,
1,
h_size,
num_layers,
vis_encode_type,
reuse=reuse_encoder,
)[0]
with tf.variable_scope("latent"):
latent_targ_distribution = GaussianEncoderDistribution(
hidden_stream_targ, feature_size, reuse=reuse_encoder
)
latent_targ = latent_targ_distribution.sample()
return latent_targ_distribution, latent_targ
#def _create_var_encoder(
# self,
# visual_in: List[tf.Tensor],
# vector_in: tf.Tensor,
# h_size: int,
# feature_size: int,
# num_layers: int,
# vis_encode_type: EncoderType,
#) -> tf.Tensor:
# """
# Creates a variational encoder for visual and vector observations.
# :param h_size: Size of hidden linear layers.
# :param num_layers: Number of hidden linear layers.
# :param vis_encode_type: Type of visual encoder to use if visual input.
# :return: The hidden layer (tf.Tensor) after the encoder.
# """
# with tf.variable_scope("encoding"):
# hidden_stream = ModelUtils.create_observation_streams(
# visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
# )[0]
# with tf.variable_scope("latent"):
# latent_distribution = GaussianEncoderDistribution(
# hidden_stream, feature_size
# )
# latent = latent_distribution.sample()
# return latent_distribution, latent
def _create_hard_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")

def run_hard_copy(self):
self.sess.run(self.target_replace_op)
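
Context for the hunk above: _create_hard_copy gathers the variables under the "target_enc" and "encoding" scopes, and run_hard_copy executes an op that overwrites each target-encoder variable with its online-encoder counterpart. The op construction itself falls outside this hunk; a minimal TF1-style sketch of what such a hard copy typically looks like (an assumption, not the committed code):

# Hypothetical sketch: pair up target and online encoder variables and assign.
# Assumes both scopes build their variables in the same order (siamese encoders).
self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]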
#def _create_inverse_model(
# def _create_inverse_model(
#) -> None:
# ) -> None:
# """
# Creates inverse model TensorFlow ops for Curiosity module.
# Predicts action taken given current and future encoded states.

:param steps: The number of steps the model was trained for
:return:
"""
#self.get_policy_weights()
# self.get_policy_weights()
with self.graph.as_default():
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
self.saver.save(self.sess, last_checkpoint)

def get_policy_weights(self):
with self.graph.as_default():
pol = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "policy/mu/bias:0")
print("policy:", self.sess.run(pol))
# pol = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "policy/mu/bias:0")
# print("policy:", self.sess.run(pol))
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding")
print("encoding:", self.sess.run(enc))
rew = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "reward")
print("reward:", self.sess.run(rew))
def create_encoders(
self, var_latent: bool = False, reuse_encoder: bool = False
) -> Tuple[tf.Tensor, tf.Tensor]:
encoded_state_list = []
encoded_next_state_list = []
if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
if self.vis_obs_size > 0:
self.next_visual_in = []
visual_encoders = []
next_visual_encoders = []
for i in range(self.vis_obs_size):
# Create input ops for next (t+1) visual observations.
next_visual_input = ModelUtils.create_visual_input(
self.brain.camera_resolutions[i],
name="next_visual_observation_" + str(i),
)
self.next_visual_in.append(next_visual_input)
# Create the encoder ops for current and next visual input.
# Note that these encoders are siamese.
with tf.variable_scope("encoding"):
encoded_visual = ModelUtils.create_visual_observation_encoder(
self.visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"stream_{}_visual_obs_encoder".format(i),
False,
)
with tf.variable_scope(next_encoder_scope):
encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"stream_{}_visual_obs_encoder".format(i),
reuse_encoder,
)
visual_encoders.append(encoded_visual)
next_visual_encoders.append(encoded_next_visual)
hidden_visual = tf.concat(visual_encoders, axis=1)
hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
encoded_state_list.append(hidden_visual)
encoded_next_state_list.append(hidden_next_visual)
if self.vec_obs_size > 0:
# Create the encoder ops for current and next vector input.
# Note that these encoders are siamese.
# Create input op for next (t+1) vector observation.
self.next_vector_in = tf.placeholder(
shape=[None, self.vec_obs_size],
dtype=tf.float32,
name="next_vector_observation",
)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.next_vector_in,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.next_vector_in
with tf.variable_scope("encoding"):
encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
self.vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"vector_obs_encoder",
False,
)
with tf.variable_scope(next_encoder_scope):
encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
self.processed_vector_next,
self.h_size,
ModelUtils.swish,
self.num_layers,
"vector_obs_encoder",
reuse_encoder,
)
encoded_state_list.append(encoded_vector_obs)
encoded_next_state_list.append(encoded_next_vector_obs)
encoded_state = tf.concat(encoded_state_list, axis=1)
encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
if var_latent:
with tf.variable_scope("encoding/latent"):
encoded_state_dist = GaussianEncoderDistribution(
encoded_state, self.feature_size
)
encoded_state = encoded_state_dist.sample()
with tf.variable_scope(next_encoder_scope + "/latent"):
encoded_next_state_dist = GaussianEncoderDistribution(
encoded_next_state, self.feature_size, reuse=reuse_encoder
)
encoded_next_state = encoded_next_state_dist.sample()
return (
encoded_state,
encoded_next_state,
encoded_state_dist,
encoded_next_state_dist,
)
else:
with tf.variable_scope("encoding"):
encoded_state = tf.layers.dense(
encoded_state, self.feature_size, name="latent"
)
with tf.variable_scope(next_encoder_scope):
encoded_next_state = tf.layers.dense(
encoded_next_state,
self.feature_size,
name="latent",
reuse=reuse_encoder,
)
return encoded_state, encoded_next_state
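
In the variational branch above, the latent features are sampled from a GaussianEncoderDistribution instead of coming from a plain dense "latent" layer. That distribution class is defined outside this diff; a common reparameterized-sampling pattern for such an encoder head looks like the sketch below (an assumption about its internals, for clarity only):

# Hypothetical reparameterized Gaussian head (not part of this commit).
mu = tf.layers.dense(hidden, feature_size, name="mu")
log_sigma = tf.layers.dense(hidden, feature_size, name="log_sigma")
sigma = tf.exp(log_sigma)
epsilon = tf.random_normal(tf.shape(mu))
latent_sample = mu + sigma * epsilon  # differentiable sample via mu + sigma * eps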
# rew = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "reward")
# print("reward:", self.sess.run(rew))
def create_inverse_model(
self,

"""
combined_input = tf.concat([encoded_state, self.current_action], axis=1)
hidden = combined_input
if separate_train:
hidden = tf.stop_gradient(hidden)
# if self.transfer:
# hidden = tf.stop_gradient(hidden)
for i in range(forward_layers):
hidden = tf.layers.dense(

self.predict_distribution = GaussianEncoderDistribution(
hidden, self.feature_size
)
self.predict = tf.tanh(self.predict_distribution.sample())
self.predict = self.predict_distribution.sample()
activation=ModelUtils.swish,
# activation=tf.tanh,
if not self.transfer:
encoded_next_state = tf.stop_gradient(encoded_next_state)
tf.squared_difference(self.predict, tf.stop_gradient(encoded_next_state)),
# tf.squared_difference(self.predict, encoded_next_state),
axis=1,
tf.squared_difference(tf.tanh(self.predict), encoded_next_state), axis=1
self.forward_loss = tf.reduce_mean(squared_difference)
# tf.dynamic_partition(squared_difference, self.mask, 2)[1]
# )
# self.forward_loss = tf.reduce_mean(squared_difference)
self.next_state = encoded_next_state
self.forward_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
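
The forward loss above uses tf.dynamic_partition with self.mask to average the squared prediction error only over valid (unmasked) timesteps: partition index 1 keeps the entries where the mask is 1. A standalone NumPy sketch of the same masked mean (illustrative only):

import numpy as np

def masked_forward_loss(squared_difference, mask):
    # Keep only the entries where mask == 1, then average them.
    mask = np.asarray(mask)
    return np.mean(np.asarray(squared_difference)[mask == 1])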
def create_reward_model(
self,

combined_input = tf.concat([encoded_state, self.current_action], axis=1)
hidden = combined_input
if separate_train:
hidden = tf.stop_gradient(hidden)
# if self.transfer:
# hidden = tf.stop_gradient(hidden)
for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,

# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
tf.squared_difference(self.pred_reward, self.current_reward)
)
#self.reward_loss = tf.clip_by_value(
tf.squared_difference(self.pred_reward, self.current_reward)
)
# self.reward_loss = tf.clip_by_value(
#)
# )
def create_bisim_model(
self,

self.bisim_predict_distribution = GaussianEncoderDistribution(
hidden, self.feature_size, reuse=True
)
self.bisim_predict = tf.tanh(self.predict_distribution.sample())
self.bisim_predict = self.predict_distribution.sample()
hidden = combined_input
for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,
self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
reuse=True,
activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.bisim_pred_reward = tf.layers.dense(
hidden,
1,
name="reward",
reuse=True
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
hidden = combined_input
for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,
self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
reuse=True,
activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.bisim_pred_reward = tf.layers.dense(
hidden,
1,
name="reward",
reuse=True
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
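
Every dense layer in the bisim model above is built with reuse=True, so this branch evaluates the existing forward/reward weights on a second batch of observations rather than creating new parameters. A minimal TF1 sketch of that sharing pattern (tensor names here are illustrative):

out_a = tf.layers.dense(inputs_a, 32, name="hidden_0")              # creates hidden_0/kernel and hidden_0/bias
out_b = tf.layers.dense(inputs_b, 32, name="hidden_0", reuse=True)  # reuses the same weights on a second batch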

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (38 changes)


min_value=1e-10,
)
self.model_learning_rate = ModelUtils.create_schedule(
# ScheduleType.LINEAR,
ScheduleType.CONSTANT,
ScheduleType.LINEAR,
# ScheduleType.CONSTANT,
lr,
self.policy.global_step,
int(max_step),

hyperparameters.load_policy,
hyperparameters.load_value,
)
#self.policy.get_encoder_weights()
#self.policy.get_policy_weights()
# self.policy.get_encoder_weights()
# self.policy.get_policy_weights()
# slim.model_analyzer.analyze_vars(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), print_info=True)

tf.reduce_sum(
tf.squared_difference(
self.policy.bisim_predict, self.policy.predict
), axis=1)
),
axis=1,
)
reward_diff = tf.reduce_sum(tf.abs(
self.policy.bisim_pred_reward - self.policy.pred_reward
), axis=1)
predict_diff = self.reward_signals[
"extrinsic"
].gamma * predict_diff + reward_diff
encode_dist = tf.reduce_sum(tf.abs(self.policy.encoder - self.policy.bisim_encoder), axis=1)
reward_diff = tf.reduce_sum(
tf.abs(self.policy.bisim_pred_reward - self.policy.pred_reward),
axis=1,
)
predict_diff = (
self.reward_signals["extrinsic"].gamma * predict_diff + reward_diff
)
encode_dist = tf.reduce_sum(
tf.abs(self.policy.encoder - self.policy.bisim_encoder), axis=1
)
self.bisim_loss = tf.reduce_mean(tf.squared_difference(encode_dist, predict_diff))
self.bisim_loss = tf.reduce_mean(
tf.squared_difference(encode_dist, predict_diff)
)
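
The bisimulation loss above follows a deep-bisimulation-for-control style recipe: the L1 distance between two state encodings (encode_dist) is regressed onto a target built from the reward difference plus the discounted distance between the predicted next-state features (predict_diff), using a mean squared error. A compact NumPy sketch of that construction, assuming gamma is the extrinsic reward discount:

import numpy as np

def bisim_loss(enc_a, enc_b, pred_next_a, pred_next_b, rew_a, rew_b, gamma=0.99):
    # L1 distance between the two encodings.
    encode_dist = np.sum(np.abs(enc_a - enc_b), axis=1)
    # Distance between predicted next-state features (a Wasserstein term in the full model).
    transition_dist = np.sqrt(np.sum(np.square(pred_next_a - pred_next_b), axis=1))
    # Reward difference plus discounted transition distance forms the target.
    target = np.sum(np.abs(rew_a - rew_b), axis=1) + gamma * transition_dist
    # Regress the encoding distance onto the bisimulation target.
    return np.mean(np.square(encode_dist - target))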
self.loss = (
self.policy_loss

train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
if self.train_value:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
#print("trainable", train_vars)
# print("trainable", train_vars)
self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)

elif self.in_batch_alter:
update_vals = self._execute_model(feed_dict, self.model_update_dict)
update_vals.update(self._execute_model(feed_dict, self.ppo_update_dict))
# print(self._execute_model(feed_dict, {"pred": self.policy.predict, "enc": self.policy.next_state}))
elif self.use_transfer and self.smart_transfer:
if self.update_mode == "model":
update_vals = self._execute_model(feed_dict, self.update_dict)

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (46 changes)


# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
n_sequences = max(int(batch_size / self.policy.sequence_length), 1)
#self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
# self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
max_num_batch = 20 #buffer_length // batch_size
max_num_batch = 20 # buffer_length // batch_size
buffer.sample_mini_batch(batch_size, self.policy.sequence_length), n_sequences, "model"
buffer.sample_mini_batch(batch_size, self.policy.sequence_length),
n_sequences,
"model",
#buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
#)
# buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
# )
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_bisim:

batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:
# if abs(self.old_loss - np.mean(stat_list)) < 1e-3:
# self.train_model = False
# else:
# self.old_loss = np.mean(stat_list)
# if self.num_update >= 10:
# self.train_model = False
print(stat, np.mean(stat_list))
self.policy.get_encoder_weights()
# if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:
# if abs(self.old_loss - np.mean(stat_list)) < 1e-3:
# self.train_model = False
# else:
# self.old_loss = np.mean(stat_list)
# if self.num_update >= 10:
# self.train_model = False
# print(stat, np.mean(stat_list))
# self.policy.get_encoder_weights()
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()

self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
if (
self.off_policy_buffer.num_experiences
> 10 * self.hyperparameters.buffer_size
):
print("truncate")
self.off_policy_buffer.truncate(int(5 * self.hyperparameters.buffer_size))
# if (
# self.off_policy_buffer.num_experiences
# > 10 * self.hyperparameters.buffer_size
# ):
# print("truncate")
# self.off_policy_buffer.truncate(int(5 * self.hyperparameters.buffer_size))
return True

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (53 changes)


# separate_value_train=True
# separate_value_net=True,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=64),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,

# assert all(not math.isnan(reward) for reward in processed_rewards)
# assert all(reward > success_threshold for reward in processed_rewards)
def test_2d_ppo(
config=PPO_CONFIG, obs_spec_type="rich1", run_id="ppo_rich1", seed=0
):
def test_2d_ppo(config=PPO_CONFIG, obs_spec_type="rich1", run_id="ppo_rich1", seed=0):
env = SimpleTransferEnvironment(
[BRAIN_NAME],
use_discrete=False,

goal_type="hard",
)
new_hyperparams = attr.evolve(
config.hyperparameters,
batch_size=1200,
buffer_size=12000,
learning_rate=5.0e-3,
config.hyperparameters, batch_size=1200, buffer_size=12000, learning_rate=5.0e-3
)
config = attr.evolve(
config, hyperparameters=new_hyperparams, max_steps=350000, summary_freq=5000

)
def test_2d_model(
config=Transfer_CONFIG, obs_spec_type="rich1", run_id="model_rich1", seed=0

use_op_buffer=False,
in_epoch_alter=False,
in_batch_alter=True,
policy_layers=1,
value_layers=1,
forward_layers=1,
policy_layers=0,
value_layers=2,
forward_layers=0,
feature_size=32,
feature_size=16,
# use_inverse_model=True
config, hyperparameters=new_hyperparams, max_steps=350000, summary_freq=5000
config, hyperparameters=new_hyperparams, max_steps=250000, summary_freq=5000
)
_check_environment_trains(
env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed

learning_rate=5.0e-3,
train_policy=True,
train_value=True,
train_model=True, # YS: I tried retraining model
train_model=False,
train_encoder=True,
reuse_encoder=True,
feature_size=32,
feature_size=16,
load_model=True,
policy_layers=1,
forward_layers=1,
value_layers=1,
policy_layers=0,
value_layers=2,
forward_layers=0,
reuse_encoder=True, # YS: I added this
config, hyperparameters=new_hyperparams, max_steps=350000, summary_freq=5000
config, hyperparameters=new_hyperparams, max_steps=250000, summary_freq=5000
)
_check_environment_trains(
env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed

if __name__ == "__main__":
# for seed in range(5):
# if seed > -1:
# for obs in ["normal", "rich1", "rich2"]:
# test_2d_model(seed=seed, obs_spec_type=obs, run_id="model_" + obs)
for seed in range(5, 10):
if seed > -1:
for obs in ["normal", "rich1", "rich2"]:
test_2d_model(seed=seed, obs_spec_type=obs, run_id="model_" + obs)
# # test_2d_model(config=SAC_CONFIG, run_id="sac_rich2_hard", seed=0)
# for obs in ["normal", "rich2"]:

# run_id=obs + "transfer_to_rich2",
# )
# test_2d_transfer(seed=0, obs_spec_type="longpre",
# test_2d_transfer(seed=0, obs_spec_type="longpre",
# transfer_from="./transfer_results/model_normal_s0/Simple",
# run_id="normal_transfer_to_longpre_reuse_trainmod")