
fix normalization

/develop/bisim-review
yanchaosun, 5 years ago
Current commit
5a778ca3
7 files changed, 244 insertions and 67 deletions
  1. config/ppo_transfer/CrawlerStaticP0.yaml (8 changes)
  2. config/ppo_transfer/TransferCrawlerStaticP0.yaml (3 changes)
  3. ml-agents/mlagents/trainers/policy/tf_policy.py (1 change)
  4. ml-agents/mlagents/trainers/policy/transfer_policy.py (97 changes)
  5. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (22 changes)
  6. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (146 changes)
  7. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (34 changes)

config/ppo_transfer/CrawlerStaticP0.yaml (8 changes)


lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
encoder_layers: 3
policy_layers: 0
encoder_layers: 2
policy_layers: 2
separate_value_train: true
predict_return: true
# use_bisim: true
separate_value_net: true
network_settings:
normalize: true
hidden_units: 512

config/ppo_transfer/TransferCrawlerStaticP0.yaml (3 changes)


encoder_layers: 3
policy_layers: 0
forward_layers: 2
value_layers: 3
value_layers: 0
separate_value_train: true
in_epoch_alter: true
use_op_buffer: true
use_var_predict: true

ml-agents/mlagents/trainers/policy/tf_policy.py (1 change)


self.running_mean: Optional[tf.Variable] = None
self.running_variance: Optional[tf.Variable] = None
self.update_normalization_op: Optional[tf.Operation] = None
self.vn_update_normalization_op: Optional[tf.Operation] = None
self.value: Optional[tf.Tensor] = None
self.all_log_probs: tf.Tensor = None
self.total_log_probs: Optional[tf.Tensor] = None

ml-agents/mlagents/trainers/policy/transfer_policy.py (97 changes)


import os
import numpy as np
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import EncoderType, NormalizerTensors
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings

self.inverse_model = inverse_model
self.reuse_encoder = reuse_encoder
self.feature_size = feature_size
self.predict_return = predict_return
with self.graph.as_default():
tf.set_random_seed(self.seed)

return
self.create_input_placeholders()
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
)
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
)
self.current_reward = tf.placeholder(
shape=[None], dtype=tf.float32, name="current_reward"
)

"observation": ["encoding", "inverse"]}
if load_model:
load_nets["dynamics"].append("predict")
if self.predict_return:
load_nets["dynamics"].append("reward")
if load_policy:
load_nets["dynamics"].append("policy")
if load_value:

)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
vn_normalization_tensors = self.create_target_normalizer(self.vector_next)
self.vn_update_normalization_op = vn_normalization_tensors.update_op
self.vn_normalization_steps = vn_normalization_tensors.steps
self.vn_running_mean = vn_normalization_tensors.running_mean
self.vn_running_variance = vn_normalization_tensors.running_variance
self.running_mean,
self.running_variance,
self.normalization_steps,
self.vn_running_mean,
self.vn_running_variance,
self.vn_normalization_steps,
self.vp_update_normalization_op = None
with tf.variable_scope(next_encoder_scope):
hidden_stream_targ = ModelUtils.create_observation_streams(

inverse_saver = tf.train.Saver(inverse_vars)
inverse_checkpoint = os.path.join(self.model_path, "inverse.ckpt")
inverse_saver.save(self.sess, inverse_checkpoint)
if self.predict_return:
reward_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
reward_saver = tf.train.Saver(reward_vars)
reward_checkpoint = os.path.join(self.model_path, "reward.ckpt")
reward_saver.save(self.sess, reward_checkpoint)
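For reference, a minimal sketch of how one of these per-module checkpoints could later be restored with a Saver built over the same variable scope (assumed usage, reusing the same session and model_path attributes; not part of this commit):

# Hypothetical restore of the "reward" sub-network saved above.
reward_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
reward_saver = tf.train.Saver(reward_vars)
reward_saver.restore(self.sess, os.path.join(self.model_path, "reward.ckpt"))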
def create_target_normalizer(self, vector_obs: tf.Tensor) -> NormalizerTensors:
vec_obs_size = vector_obs.shape[1]
steps = tf.get_variable(
"vn_normalization_steps",
[],
trainable=False,
dtype=tf.int32,
initializer=tf.zeros_initializer(),
)
running_mean = tf.get_variable(
"vn_running_mean",
[vec_obs_size],
trainable=False,
dtype=tf.float32,
initializer=tf.zeros_initializer(),
)
running_variance = tf.get_variable(
"vn_running_variance",
[vec_obs_size],
trainable=False,
dtype=tf.float32,
initializer=tf.ones_initializer(),
)
update_normalization = ModelUtils.create_normalizer_update(
vector_obs, steps, running_mean, running_variance
)
return NormalizerTensors(
update_normalization, steps, running_mean, running_variance
)
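As a rough illustration (assumed; not the exact formula built by ModelUtils.create_normalizer_update), the target normalizer maintains streaming mean/variance statistics along these lines:

import numpy as np

def running_update(batch, steps, mean, var):
    # Streaming mean/variance update in the spirit of the normalizer op
    # (numpy sketch only; the real op is a TensorFlow graph node).
    n = batch.shape[0]
    total = steps + n
    batch_mean = batch.mean(axis=0)
    new_mean = mean + (batch_mean - mean) * (n / total)
    # Parallel-variance combination of the old and batch statistics.
    new_var = (
        var * steps
        + batch.var(axis=0) * n
        + (batch_mean - mean) ** 2 * steps * n / total
    ) / total
    return total, new_mean, new_var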
def update_normalization(self, vector_obs: np.ndarray, vector_obs_next: np.ndarray) -> None:
"""
If this policy normalizes vector observations, this will update the norm values in the graph.
:param vector_obs: The vector observations to add to the running estimate of the distribution.
:param vector_obs_next: The next vector observations to add to the target normalizer's running estimate.
"""
if self.use_vec_obs and self.normalize:
self.sess.run(
self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
)
self.sess.run(
self.vn_update_normalization_op, feed_dict={self.vector_next: vector_obs_next}
)
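On the trainer side (see the trainer.py hunk below), the call now passes both the current and the next vector observations, roughly:

# Mirrors the trainer-side call later in this diff (buffer keys as shown there).
self.policy.update_normalization(
    agent_buffer_trajectory["vector_obs"],
    agent_buffer_trajectory["next_vector_in"],
)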
def get_encoder_weights(self):
with self.graph.as_default():
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding/latent/bias:0")

self.brain.camera_resolutions
)
self.vector_bisim = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
self.processed_vector_bisim = ModelUtils.normalize_vector_obs(
self.vector_bisim,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_bisim = self.vector_bisim
# if self.normalize:
# self.processed_vector_bisim = ModelUtils.normalize_vector_obs(
# self.vector_bisim,
# self.running_mean,
# self.running_variance,
# self.normalization_steps,
# )
# else:
# self.processed_vector_bisim = self.vector_bisim
hidden_stream = ModelUtils.create_observation_streams(
self.visual_bisim,

kernel_initializer=tf.initializers.variance_scaling(1.0),
reuse=True
)
self.bisim_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="bisim_action"
)
[self.bisim_encoder, self.current_action], axis=1
[self.bisim_encoder, self.bisim_action], axis=1
combined_input = tf.stop_gradient(combined_input)
with tf.variable_scope("predict"):
hidden = combined_input
for i in range(forward_layers):

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (22 changes)


)
self.bisim_learning_rate = ModelUtils.create_schedule(
ScheduleType.LINEAR,
lr,
lr/10,
self.policy.global_step,
int(max_step),
min_value=1e-10,

self.model_loss = self.policy.forward_loss
if self.predict_return:
self.model_loss += self.policy.reward_loss
self.model_loss += 0.5 * self.policy.reward_loss
if self.with_prior:
if self.use_var_encoder:
self.model_loss += 0.2 * self.policy.encoder_distribution.kl_standard()

reward_diff = tf.reduce_mean(
tf.squared_difference(self.policy.bisim_pred_reward, self.policy.pred_reward)
)
predict_diff = self.reward_signals["extrinsic"].gamma * predict_diff + reward_diff
predict_diff = self.reward_signals["extrinsic"].gamma * predict_diff + tf.abs(reward_diff)
encode_dist = tf.reduce_mean(
tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
)
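For context, this hunk approximates a bisimulation-style objective in which the latent distance between two states is trained to match a reward difference plus a discounted transition-prediction difference. A sketch with assumed tensor names (not the exact loss assembled here):

import tensorflow as tf

def bisim_loss_sketch(z_i, z_j, r_i, r_j, pred_next_i, pred_next_j, gamma):
    # Target distance: reward difference plus discounted predicted-transition
    # difference, matching the gamma * predict_diff + |reward_diff| shape above.
    reward_diff = tf.reduce_mean(tf.squared_difference(r_i, r_j))
    predict_diff = tf.reduce_mean(tf.squared_difference(pred_next_i, pred_next_j))
    target = gamma * predict_diff + tf.abs(reward_diff)
    # Encourage the latent distance to track that target.
    encode_dist = tf.reduce_mean(tf.squared_difference(z_i, z_j))
    return tf.squared_difference(encode_dist, tf.stop_gradient(target))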

return update_stats
def update_encoder(self, mini_batch1: AgentBuffer, mini_batch2: AgentBuffer, mini_batch3: AgentBuffer):
def update_encoder(self, mini_batch1: AgentBuffer, mini_batch2: AgentBuffer):
stats_needed = {
"Losses/Bisim Loss": "bisim_loss",

selected_action_1 = self.policy.sess.run(self.policy.selected_actions, feed_dict = {
self.policy.vector_in: mini_batch1["vector_obs"],
})
selected_action_2 = self.policy.sess.run(self.policy.selected_actions, feed_dict = {
self.policy.vector_in: mini_batch2["vector_obs"],
})
self.policy.current_action: mini_batch3["actions"],
self.policy.current_action: selected_action_1,
self.policy.bisim_action: selected_action_2,
# print("batch 1", mini_batch1["vector_obs"])
# print("batch 2", mini_batch2["vector_obs"])
# print("batch 3", mini_batch3["vector_obs"])
update_vals = self._execute_model(feed_dict, self.bisim_update_dict)

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (146 changes)


self.use_op_buffer = self.hyperparameters.use_op_buffer
self.conv_thres = self.hyperparameters.conv_thres
self.use_bisim = self.hyperparameters.use_bisim
self.num_check = 0
self.num_update = 0
self.train_model = True
self.old_loss = np.inf
print("The current algorithm is PPO Transfer")

agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"], agent_buffer_trajectory["next_vector_in"])
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(

Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
# if self.train_model and self.use_op_buffer:
# size_of_buffer = self.off_policy_buffer.num_experiences
# self.num_check += 1
# if self.num_check % 50 == 0 and size_of_buffer >= self.hyperparameters.buffer_size:
# return True
# else:
# return False
# else:
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_bisim:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.update_buffer)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer2 = copy.deepcopy(self.update_buffer)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer3 = copy.deepcopy(self.update_buffer)
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_encoder(
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
buffer3.make_mini_batch(i, i + batch_size),
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# if self.use_bisim:
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer1 = copy.deepcopy(self.update_buffer)
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer2 = copy.deepcopy(self.update_buffer)
# max_num_batch = buffer_length // batch_size
# for i in range(0, max_num_batch * batch_size, batch_size):
# update_stats = self.optimizer.update_encoder(
# buffer1.make_mini_batch(i, i + batch_size),
# buffer2.make_mini_batch(i, i + batch_size),
# )
# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
else:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
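The shuffled deep copies used in the bisimulation branch above simply produce independent random pairings of experiences at matching mini-batch offsets; roughly (a plain-list sketch with hypothetical names, not the AgentBuffer API):

import copy
import random

def make_random_pairs(experiences, batch_size):
    # Two independently shuffled copies of the same data, sliced at the same
    # offsets, give random (state_i, state_j) pairs for the bisimulation update.
    b1, b2 = copy.deepcopy(experiences), copy.deepcopy(experiences)
    random.shuffle(b1)
    random.shuffle(b2)
    return [
        (b1[i : i + batch_size], b2[i : i + batch_size])
        for i in range(0, len(b1) - batch_size + 1, batch_size)
    ]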

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_bisim:
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.off_policy_buffer)
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer2 = copy.deepcopy(self.off_policy_buffer)
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_encoder(
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:

self._stats_reporter.add_stat(stat, val)
# self.off_policy_buffer.reset_agent()
if self.off_policy_buffer.num_experiences > 10 * self.hyperparameters.buffer_size:
print("truncate")
self.off_policy_buffer.truncate(
int(5 * self.hyperparameters.buffer_size)
)
return True
def _update_policy_new(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.
"""
update_buffer_length = self.update_buffer.num_experiences
op_buffer_length = self.off_policy_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
if self.use_iealter:
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = update_buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.off_policy_buffer
max_num_batch = update_buffer_length // batch_size # update with as much data as the policy has
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_bisim:
for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.off_policy_buffer)
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer2 = copy.deepcopy(self.off_policy_buffer)
max_num_batch = update_buffer_length // batch_size # update with as much data as the policy has
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_encoder(
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
else:
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = update_buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
if self.off_policy_buffer.num_experiences > 10 * self.hyperparameters.buffer_size:
print("truncate")
self.off_policy_buffer.truncate(

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (34 changes)


# assert all(reward > success_threshold for reward in processed_rewards)
def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="model_rich2_f4_pv-l0", seed=0):
def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich1", run_id="model_rich1_f4_pv-l0_rew-0.5", seed=0):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"

# use_bisim=False, predict_return=True, separate_value_train=True, separate_policy_train=True,
# use_var_predict=True, with_prior=True, use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False,
# policy_layers=0, value_layers=0, encoder_layers=2, feature_size=4,
use_bisim=True, predict_return=True,
# separate_value_train=True, separate_policy_train=True,
use_var_predict=True, with_prior=True, use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False,
policy_layers=0, value_layers=0, encoder_layers=2, feature_size=4,
def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="transfer_f4_rich2_from-rich1-retrain-pv", seed=1337):
def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich1",
transfer_from="./transfer_results/model_rich2_f4_pv-l0_rew_bisim-op_s0/Simple",
run_id="transfer_f4_rich1_from-rich2-retrain-pv_rew_bisim-op", seed=1337):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"

transfer_path="./transfer_results/model_rich1_f4_pv-l0_s0/Simple",
transfer_path=transfer_from,
load_value=False,
value_layers=0, encoder_layers=2,
load_value=False, predict_return=True, value_layers=0, encoder_layers=2,
use_bisim=True,
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)

# test_2d_model(seed=0)
test_2d_model(config=SAC_CONFIG, run_id="sac_rich2_hard", seed=0)
# test_2d_transfer(seed=0)
for obs in ["normal"]: # ["normal", "rich1", "rich2"]:
test_2d_model(seed=0, obs_spec_type=obs, run_id="model_" + obs \
+ "_f4_pv-l0_rew_bisim-op_samelen")
# test_2d_model(config=SAC_CONFIG, run_id="sac_rich2_hard", seed=0)
# for obs in ["normal", "rich1"]:
# test_2d_transfer(seed=0, obs_spec_type="rich2",
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim-op_samelen_s0/Simple",
# run_id="transfer_rich2_f4_pv-l0_rew_bisim-op_samelen_from_" + obs)
# for obs in ["normal", "rich2"]:
# test_2d_transfer(seed=0, obs_spec_type="rich1",
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim-op_new_s0/Simple",
# run_id="transfer_rich1_f4_pv-l0_rew_bisim-op_new_from" + obs)
# for i in range(5):
# test_2d_model(seed=i)