
fix bug with bisimulation

/develop/transfer-bisim
yanchaosun, 4 years ago
Current commit
9bc90956
5 files changed, 492 insertions and 213 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (204 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (220 changes)
  3. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (30 changes)
  4. ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (200 changes)
  5. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (51 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (204 changes)


return kl
def w_distance(self, another):
return tf.squared_difference(self.mu, another.mu) + tf.squared_difference(self.sigma, another.sigma)
return tf.reduce_mean(tf.squared_difference(self.mu, another.mu))\
+ tf.reduce_mean(tf.squared_difference(self.sigma, another.sigma))
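Note: the w_distance change above swaps an element-wise squared difference for a scalar reduce_mean version. For reference, a minimal NumPy sketch of the quantity it approximates, the squared 2-Wasserstein distance between two diagonal Gaussians; the function and argument names are illustrative, not from this diff, and the committed code uses a mean rather than a sum, which only rescales the value.

import numpy as np

def w2_squared(mu1, sigma1, mu2, sigma2):
    # W2^2(N(mu1, diag(sigma1^2)), N(mu2, diag(sigma2^2)))
    #   = ||mu1 - mu2||^2 + ||sigma1 - sigma2||^2
    mu1, sigma1, mu2, sigma2 = map(np.asarray, (mu1, sigma1, mu2, sigma2))
    return float(np.sum((mu1 - mu2) ** 2) + np.sum((sigma1 - sigma2) ** 2))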
class TransferPolicy(TFPolicy):

self.trainable_variables: List[tf.Variable] = []
self.encoder = None
self.encoder_distribution = None
self.targ_encoder = None
self.next_encoder = None
self.target_encoder = None
# Non-exposed parameters; these aren't exposed because they don't have a

self.vis_encode_type
)
self.targ_encoder = self._create_target_encoder(
self.next_encoder = self._create_next_encoder(
reuse_encoder
reuse_encoder=True
self.target_encoder = self._create_target_encoder(
self.visual_in,
self.processed_vector_in,
self.h_size,
self.feature_size,
encoder_layers,
self.vis_encode_type
)
# used to encode the next state
self.next_encoder = tf.stop_gradient(self.next_encoder)
# a stable version, used to compute the value
self.target_encoder = tf.stop_gradient(self.target_encoder)
self._create_hard_copy()
if not reuse_encoder:
self.targ_encoder = tf.stop_gradient(self.targ_encoder)
self._create_hard_copy()
self.create_inverse_model(self.encoder, self.targ_encoder, inverse_layers)
self.create_inverse_model(self.encoder, self.next_encoder, inverse_layers)
self.create_forward_model(self.encoder, self.targ_encoder, forward_layers,
self.create_forward_model(self.encoder, self.next_encoder, forward_layers,
self.create_reward_model(self.encoder, self.targ_encoder, forward_layers)
self.create_reward_model(self.encoder, self.next_encoder, forward_layers)
if self.use_bisim:
self.create_bisim_model(self.h_size, self.feature_size, encoder_layers,

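For orientation: the constructor hunk above replaces the single targ_encoder with a next_encoder (a gradient-blocked encoding of the next state, used as the regression target for the inverse/forward/reward models) and a separate target_encoder (a stable copy that feeds the value function and is refreshed by a copy op). A minimal TensorFlow 1.x sketch of that consumption pattern; the shapes, sizes, and the omission of the action input are assumptions for illustration only.

import tensorflow as tf  # assumes TF 1.x, as used by this repository

obs = tf.placeholder(tf.float32, [None, 8], name="obs")
next_obs = tf.placeholder(tf.float32, [None, 8], name="next_obs")

with tf.variable_scope("encoding"):
    latent = tf.layers.dense(obs, 4, name="latent")  # online encoder, trained
with tf.variable_scope("next_enc"):
    next_latent = tf.stop_gradient(tf.layers.dense(next_obs, 4, name="latent"))
with tf.variable_scope("target_enc"):
    value_latent = tf.stop_gradient(tf.layers.dense(obs, 4, name="latent"))  # feeds the target critic (not shown)

# the forward model regresses toward the gradient-blocked next-state latent
pred_next = tf.layers.dense(latent, 4, name="predict")
forward_loss = tf.reduce_mean(tf.squared_difference(pred_next, next_latent))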
return predict
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]

run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def _create_target_encoder(
def _create_next_encoder(
self,
h_size: int,
feature_size: int,

if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
next_encoder_scope = "next_enc"
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions

kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent
def _create_target_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("target_enc"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in,
vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
latent = tf.layers.dense(
hidden_stream,
feature_size,
name="latent",
# activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent
def _create_var_target_encoder(
self,

self.target_replace_op = [tf.assign(t, 0.9*t + 0.1*e) for t, e in zip(t_params, e_params)]
def run_hard_copy(self):
# print("before:")
# self.get_encoder_weights()
self.sess.run(self.target_replace_op)
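Despite the name run_hard_copy, the assign op above is a soft (Polyak) update with coefficient 0.1. A self-contained TF 1.x sketch of the same pattern; the scope names are placeholders.

import tensorflow as tf  # TF 1.x

def build_soft_copy_op(online_scope, target_scope, tau=0.1):
    # target <- (1 - tau) * target + tau * online, i.e. 0.9*t + 0.1*e as above
    o_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=online_scope)
    t_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope)
    return [tf.assign(t, (1.0 - tau) * t + tau * o) for t, o in zip(t_vars, o_vars)]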
def _create_inverse_model(

rew = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "reward")
print("reward:", self.sess.run(rew))
def create_encoders(self, var_latent: bool=False, reuse_encoder: bool=False) -> Tuple[tf.Tensor, tf.Tensor]:
encoded_state_list = []
encoded_next_state_list = []
if reuse_encoder:
next_encoder_scope = "encoding"
else:
next_encoder_scope = "target_enc"
if self.vis_obs_size > 0:
self.next_visual_in = []
visual_encoders = []
next_visual_encoders = []
for i in range(self.vis_obs_size):
# Create input ops for next (t+1) visual observations.
next_visual_input = ModelUtils.create_visual_input(
self.brain.camera_resolutions[i],
name="next_visual_observation_" + str(i),
)
self.next_visual_in.append(next_visual_input)
# Create the encoder ops for current and next visual input.
# Note that these encoders are siamese.
with tf.variable_scope("encoding"):
encoded_visual = ModelUtils.create_visual_observation_encoder(
self.visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"stream_{}_visual_obs_encoder".format(i),
False,
)
with tf.variable_scope(next_encoder_scope):
encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"stream_{}_visual_obs_encoder".format(i),
reuse_encoder
)
visual_encoders.append(encoded_visual)
next_visual_encoders.append(encoded_next_visual)
hidden_visual = tf.concat(visual_encoders, axis=1)
hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
encoded_state_list.append(hidden_visual)
encoded_next_state_list.append(hidden_next_visual)
if self.vec_obs_size > 0:
# Create the encoder ops for current and next vector input.
# Note that these encoders are siamese.
# Create input op for next (t+1) vector observation.
self.next_vector_in = tf.placeholder(
shape=[None, self.vec_obs_size],
dtype=tf.float32,
name="next_vector_observation",
)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.next_vector_in,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.next_vector_in
with tf.variable_scope("encoding"):
encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
self.vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"vector_obs_encoder",
False,
)
with tf.variable_scope(next_encoder_scope):
encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
self.processed_vector_next,
self.h_size,
ModelUtils.swish,
self.num_layers,
"vector_obs_encoder",
reuse_encoder
)
encoded_state_list.append(encoded_vector_obs)
encoded_next_state_list.append(encoded_next_vector_obs)
encoded_state = tf.concat(encoded_state_list, axis=1)
encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
if var_latent:
with tf.variable_scope("encoding/latent"):
encoded_state_dist = GaussianEncoderDistribution(
encoded_state,
self.feature_size,
)
encoded_state = encoded_state_dist.sample()
with tf.variable_scope(next_encoder_scope+"/latent"):
encoded_next_state_dist = GaussianEncoderDistribution(
encoded_next_state,
self.feature_size,
reuse=reuse_encoder
)
encoded_next_state = encoded_next_state_dist.sample()
return encoded_state, encoded_next_state, encoded_state_dist, encoded_next_state_dist
else:
with tf.variable_scope("encoding"):
encoded_state = tf.layers.dense(
encoded_state,
self.feature_size,
name="latent"
)
with tf.variable_scope(next_encoder_scope):
encoded_next_state = tf.layers.dense(
encoded_next_state,
self.feature_size,
name="latent",
reuse=reuse_encoder
)
return encoded_state, encoded_next_state
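The create_encoders hunk builds a siamese pair of current/next-state encoders: with reuse_encoder=True both live under the "encoding" scope and share weights; otherwise the next-state encoder gets its own scope ("next_enc"/"target_enc"). A minimal TF 1.x sketch of that reuse mechanism, with illustrative shapes and names.

import tensorflow as tf  # TF 1.x

obs = tf.placeholder(tf.float32, [None, 8], name="obs")
next_obs = tf.placeholder(tf.float32, [None, 8], name="next_obs")

def encode(x, scope, reuse):
    with tf.variable_scope(scope, reuse=reuse):
        return tf.layers.dense(x, 4, name="latent")

z = encode(obs, "encoding", reuse=False)
z_next_shared = encode(next_obs, "encoding", reuse=True)    # siamese: same weights
z_next_target = encode(next_obs, "next_enc", reuse=False)   # separate target scope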
def create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor, inverse_layers: int
) -> None:

:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
self.current_action = tf.stop_gradient(self.current_action)
combined_input = tf.concat(
[encoded_state, self.current_action], axis=1
)

)
self.reward_loss = tf.clip_by_value(tf.reduce_mean(
tf.squared_difference(self.pred_reward, self.current_reward)
), 1e-10,1.0)
), 1e-10, 1e10)
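The hunk above widens the upper bound of the clipped reward-model loss from 1.0 to 1e10, presumably so the loss no longer saturates once the squared reward error exceeds 1. A NumPy sketch of the computation; the names are illustrative.

import numpy as np

def clipped_reward_loss(pred_reward, true_reward, lo=1e-10, hi=1e10):
    mse = float(np.mean(np.square(np.asarray(pred_reward) - np.asarray(true_reward))))
    return float(np.clip(mse, lo, hi))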
def create_bisim_model(
self,

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (220 changes)


from typing import Optional, Any, Dict, cast
from typing import Optional, Any, Dict, cast, List, Tuple
import numpy as np
import os
import copy

from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.policy.transfer_policy import TransferPolicy

else:
self._create_dc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
with tf.variable_scope("target_value"):
if policy.use_continuous_act:
self._create_cc_critic_target(h_size, hyperparameters.value_layers, vis_encode_type)
else:
self._create_dc_critic_target(h_size, hyperparameters.value_layers, vis_encode_type)
self._create_soft_critic_copy()
with tf.variable_scope("optimizer/"):
self.learning_rate = ModelUtils.create_schedule(
self._schedule,

min_value=1e-10,
)
self.model_learning_rate = ModelUtils.create_schedule(
# ScheduleType.LINEAR,
ScheduleType.CONSTANT,
self._schedule,
lr,
self.policy.global_step,
int(max_step),

ScheduleType.CONSTANT,
lr/10,
self._schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,

self.old_log_probs,
self.value_heads,
self.policy.entropy,
self.policy.targ_encoder,
self.policy.next_encoder,
self.policy.predict,
beta,
epsilon,

if self.use_transfer:
self.policy.load_graph_partial(self.transfer_path, self.transfer_type,
hyperparameters.load_model, hyperparameters.load_policy, hyperparameters.load_value)
self.run_soft_critic_copy()
self.policy.get_encoder_weights()
self.policy.get_policy_weights()

)
def _create_losses(
self, probs, old_probs, value_heads, entropy, targ_encoder, predict,
self, probs, old_probs, value_heads, entropy, next_encoder, predict,
beta, epsilon, lr, max_step
):
"""

# For cleaner stats reporting
self.abs_policy_loss = tf.abs(self.policy_loss)
# encoder and predict loss
# self.dis_returns = tf.placeholder(
# shape=[None], dtype=tf.float32, name="dis_returns"
# )
# target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
# if self.predict_return:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
# else:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
# if self.with_prior:
# if self.use_var_encoder:
# self.model_loss += encoder_distribution.kl_standard()
# if self.use_var_predict:
# self.model_loss += self.policy.predict_distribution.kl_standard()
self.model_loss = self.policy.forward_loss
if self.predict_return:
self.model_loss += 0.5 * self.policy.reward_loss

reward_diff = tf.reduce_mean(
tf.squared_difference(self.policy.bisim_pred_reward, self.policy.pred_reward)
)
predict_diff = self.reward_signals["extrinsic"].gamma * predict_diff + tf.abs(reward_diff)
predict_diff = 0.99 * predict_diff + tf.abs(reward_diff)
tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
tf.abs(self.policy.encoder - self.policy.bisim_encoder)
self.encode_dist_val = encode_dist
self.predict_diff_val = predict_diff
self.bisim_loss = tf.squared_difference(encode_dist, predict_diff)
self.loss = (

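The bisimulation hunk above regresses the distance between two sampled latent states toward the absolute reward difference plus a discounted distance between their predicted next latents, in the spirit of deep bisimulation metrics. A NumPy sketch under that reading; every name and the gamma default are illustrative.

import numpy as np

def bisim_loss_sketch(z_i, z_j, r_i, r_j, pred_next_i, pred_next_j, gamma=0.99):
    encode_dist = float(np.mean(np.abs(np.asarray(z_i) - np.asarray(z_j))))
    reward_diff = float(np.mean(np.square(np.asarray(r_i) - np.asarray(r_j))))
    predict_diff = float(np.mean(np.square(np.asarray(pred_next_i) - np.asarray(pred_next_j))))
    target = gamma * predict_diff + abs(reward_diff)
    return (encode_dist - target) ** 2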
update_vals = self._execute_model(feed_dict, self.update_dict)
# update target encoder
if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
if self.num_updates % self.copy_every == 0:
self.run_soft_critic_copy()
# print("copy")
# self.policy.get_encoder_weights()

update_vals = self._execute_model(feed_dict, self.model_only_update_dict)
# update target encoder
if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
if self.num_updates % self.copy_every == 0:
self.run_soft_critic_copy()
# print("copy")
# self.policy.get_encoder_weights()
self.num_updates += 1
return update_stats
def update_encoder(self, mini_batch1: AgentBuffer, mini_batch2: AgentBuffer):

}
update_vals = self._execute_model(feed_dict, self.bisim_update_dict)
# print("model difference:", self.policy.sess.run(self.predict_diff_val, feed_dict=feed_dict))
# print("encoder distance:", self.policy.sess.run(self.encode_dist_val, feed_dict=feed_dict))
for stat_name, update_name in stats_needed.items():
if update_name in update_vals.keys():

keepdims=True,
)
def _create_cc_critic_target(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
input_state = self.policy.target_encoder
hidden_value = ModelUtils.create_vector_observation_encoder(
input_state,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
self.target_value_heads, self.target_value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
for name in self.stream_names:
self.target_value_heads[name] = tf.stop_gradient(self.target_value_heads[name])
self.target_all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
self.target_old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
def _create_dc_critic_target(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
input_state = self.policy.target_encoder
hidden_value = ModelUtils.create_vector_observation_encoder(
input_state,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
self.target_value_heads, self.target_value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
for name in self.stream_names:
self.target_value_heads[name] = tf.stop_gradient(self.target_value_heads[name])
# self.target_value[name] = tf.stop_gradient(self.target_value[name])
self.target_all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
# Break old log probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.target_all_old_log_probs, self.policy.act_size
)
_, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
old_log_prob_branches, self.policy.action_masks, self.policy.act_size
)
action_idx = [0] + list(np.cumsum(self.policy.act_size))
self.target_old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.policy.selected_actions[
:, action_idx[i] : action_idx[i + 1]
],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
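The block above recovers the summed log-probability of the selected discrete actions from per-branch logits via a negated softmax cross-entropy. A NumPy sketch of the same computation over a list of branches; the names and shapes are illustrative.

import numpy as np

def summed_action_log_probs(one_hot_actions, branch_logits):
    # one_hot_actions[k], branch_logits[k]: arrays of shape [batch, branch_size_k]
    total = 0.0
    for a, l in zip(one_hot_actions, branch_logits):
        l = l - np.max(l, axis=-1, keepdims=True)  # numerical stability
        log_softmax = l - np.log(np.sum(np.exp(l), axis=-1, keepdims=True))
        total = total + np.sum(a * log_softmax, axis=-1)  # = -cross_entropy per branch
    return total  # shape [batch]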
def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: batch.num_experiences,
self.policy.sequence_length_ph: batch.num_experiences, # We want to feed data in batch-wise, not time-wise.
}
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = batch["vector_obs"]
if self.policy.vis_obs_size > 0:
for i in range(len(self.policy.visual_in)):
_obs = batch["visual_obs%d" % i]
feed_dict[self.policy.visual_in[i]] = _obs
if self.policy.use_recurrent:
feed_dict[self.policy.memory_in] = [
np.zeros((self.policy.m_size), dtype=np.float32)
]
feed_dict[self.memory_in] = [np.zeros((self.m_size), dtype=np.float32)]
if self.policy.prev_action is not None:
feed_dict[self.policy.prev_action] = batch["prev_action"]
if self.policy.use_recurrent:
value_estimates, policy_mem, value_mem = self.sess.run(
[self.target_value_heads, self.policy.memory_out, self.memory_out], feed_dict
)
prev_action = (
batch["actions"][-1] if not self.policy.use_continuous_act else None
)
else:
value_estimates = self.sess.run(self.target_value_heads, feed_dict)
prev_action = None
policy_mem = None
value_mem = None
value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}
# We do this in a separate step to feed the memory outs - a further optimization would
# be to append to the obs before running sess.run.
final_value_estimates = self._get_value_estimates(
next_obs, done, policy_mem, value_mem, prev_action
)
return value_estimates, final_value_estimates
def _create_soft_critic_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_value')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='value')
with tf.variable_scope('hard_replacement'):
self.target_replace_op = [tf.assign(t, 0.9*t + 0.1*e) for t, e in zip(t_params, e_params)]
def run_soft_critic_copy(self):
with self.policy.graph.as_default():
# print("before")
# val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "value/extrinsic_value/bias:0")
# print("value:", self.sess.run(val))
# target_val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_value/extrinsic_value/bias:0")
# print("target_val:", self.sess.run(target_val))
self.policy.sess.run(self.target_replace_op)
# print("copy")
# val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "value/extrinsic_value/bias:0")
# print("value:", self.sess.run(val))
# target_val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_value/extrinsic_value/bias:0")
# print("target_val:", self.sess.run(target_val))

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (30 changes)


size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy_old(self):
def _update_policy(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# if self.use_bisim:
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer1 = copy.deepcopy(self.update_buffer)
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer2 = copy.deepcopy(self.update_buffer)
# max_num_batch = buffer_length // batch_size
# for i in range(0, max_num_batch * batch_size, batch_size):
# update_stats = self.optimizer.update_encoder(
# buffer1.make_mini_batch(i, i + batch_size),
# buffer2.make_mini_batch(i, i + batch_size),
# )
# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
if self.use_bisim:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.update_buffer)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer2 = copy.deepcopy(self.update_buffer)
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_encoder(
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
else:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer

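The bisimulation branch above shuffles the update buffer twice and deep-copies it, so that aligned mini-batches from the two copies yield random pairs of transitions for update_encoder. A plain-Python sketch of that pairing over a list of experiences (not the AgentBuffer API).

import copy
import random

def paired_mini_batches(experiences, batch_size):
    a, b = copy.deepcopy(experiences), copy.deepcopy(experiences)
    random.shuffle(a)
    random.shuffle(b)
    for i in range(0, (len(a) // batch_size) * batch_size, batch_size):
        yield a[i:i + batch_size], b[i:i + batch_size]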
return True
def _update_policy(self):
def _update_policy_new(self):
"""
Uses demonstration_buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.

ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (200 changes)
The diff for this file is too large to display.

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (51 changes)


self.action[name] = None
self.step_result[name] = None
self.step_count[name] = 0
self.horizon[name] = 1000
self.horizon[name] = 5000
print(self.goal)
def _make_obs_spec(self) -> List[Any]:

for _ in range(self.num_vector):
obs_spec.append((self.vec_obs_size,))
# composed position
if "rich" in self.obs_spec_type:
elif "rich" in self.obs_spec_type:
for _ in range(self.num_vector+1):
obs_spec.append((self.vec_obs_size,))
print("obs_spec:", obs_spec)

obs = []
if self.obs_spec_type == "compact":
for name in self.names:
for pos, goal in zip(self.positions[name], value):
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (goal-pos))
return obs
if self.obs_spec_type == "normal":
for name in self.names:
for i in self.positions[name]:

if self.goal_type == "easy":
done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name]) or self.step_count[name] >= self.horizon[name]
elif self.goal_type == "hard":
done = self.step_count[name] >= self.horizon[name]
# done = all(abs(pos-goal) <= 0.1 for pos, goal in zip(self.positions[name], self.goal[name])) \
# or self.step_count[name] >= self.horizon[name]
# done = self.step_count[name] >= self.horizon[name]
done = all(abs(pos-goal) <= 0.1 for pos, goal in zip(self.positions[name], self.goal[name])) \
or self.step_count[name] >= self.horizon[name]
# if done:
# print(self.positions[name], end=" done ")
return done

def _compute_reward(self, name: str, done: bool) -> float:
reward = 0.0
for _pos, goal in zip(self.positions[name], self.goal[name]):
# if abs(_pos - self.goal[name]) < 0.1:
# reward += SUCCESS_REWARD
# else:
# reward -= TIME_PENALTY
reward += 2 - abs(_pos - goal) #np.exp(-abs(_pos - goal))
# for _pos, goal in zip(self.positions[name], self.goal[name]):
# # if abs(_pos - self.goal[name]) < 0.1:
# # reward += SUCCESS_REWARD
# # else:
# # reward -= TIME_PENALTY
# reward -= abs(_pos - goal) #np.exp(-abs(_pos - goal))
# if done:
# reward = SUCCESS_REWARD
# # for _pos in self.positions[name]:
# # if self.goal_type == "easy":
# # reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
# # self.positions[name]
# # )
# # elif self.goal_type == "hard":
# # reward += np.exp(-abs(_pos - self.goal[name]))
# else:
# reward = -TIME_PENALTY
if done and self.step_count[name] < self.horizon[name]:
reward = SUCCESS_REWARD
# for _pos in self.positions[name]:
# if self.goal_type == "easy":
# reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
# self.positions[name]
# )
# elif self.goal_type == "hard":
# reward += np.exp(-abs(_pos - self.goal[name]))
else:
reward = -TIME_PENALTY
return reward
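For reference, the active shaping above gives each position dimension 2 - |pos - goal|, while the terminal branch overrides it with SUCCESS_REWARD on early success and -TIME_PENALTY otherwise. A NumPy sketch of the dense term only; the function name is illustrative.

import numpy as np

def dense_reward(positions, goals):
    positions = np.asarray(positions, dtype=float)
    goals = np.asarray(goals, dtype=float)
    return float(np.sum(2.0 - np.abs(positions - goals)))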
