    GaussianDistribution,
    MultiCategoricalDistribution,
)


    def __init__(self, encoded: tf.Tensor, feature_size: int, reuse: bool = False):
        self.mu = tf.layers.dense(
            encoded,
            feature_size,
            activation=None,
            name="log_std",
            kernel_initializer=ModelUtils.scaled_init(0.01),
            reuse=reuse,
        )
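        # Closed-form KL divergence from a standard normal, computed below:
        # KL(N(mu, sigma^2) || N(0, 1)) = 0.5 * sum(mu^2 + sigma^2 - 2 * log(sigma) - 1),
        # summed over the feature dimension for each element of the batch.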
        kl = 0.5 * tf.reduce_sum(
            tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1
        )

        return tf.squared_difference(self.mu, another.mu) + tf.squared_difference(
            self.sigma, another.sigma
        )
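# Minimal usage sketch for GaussianEncoderDistribution (illustrative only; `hidden`
# stands in for any [batch, h_size] tensor produced by an encoder, and the feature
# size of 16 is just an example):
#   dist = GaussianEncoderDistribution(encoded=hidden, feature_size=16)
#   latent = dist.sample()  # reparameterized latent code, shape [batch, 16]
# The KL term above can then be added to a loss to regularize the latent toward
# a standard normal.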
class TransferPolicy(TFPolicy):

        self.encoder_distribution = None
        self.targ_encoder = None

        # Non-exposed parameters; these aren't exposed because they don't have a
        # good explanation and usually shouldn't be touched.
        self.log_std_min = -20

        """
        return self.trainable_variables
    def create_tf_graph(
        self,
        encoder_layers=1,
        policy_layers=1,
        forward_layers=1,
        inverse_layers=1,
        feature_size=16,
        transfer=False,
        separate_train=False,
        var_predict=True,
        predict_return=True,
        reuse_encoder=True,
        use_bisim=True,
    ) -> None:
        """
        Builds the tensorflow graph needed for this policy.

        return

        self.create_input_placeholders()
        self.current_action = tf.placeholder(
            shape=[None, sum(self.act_size)],
            dtype=tf.float32,
            name="current_action",
        )
        self.current_reward = tf.placeholder(
            shape=[None], dtype=tf.float32, name="current_reward"
        )
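        # current_action and current_reward are fed from sampled trajectories; they
        # are consumed by the inverse, forward, and reward models created below.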
        if var_encoder:
            self.encoder_distribution, self.encoder = self._create_var_encoder(
                self.visual_in,
                encoder_layers,
                self.vis_encode_type,
            )

            _, self.targ_encoder = self._create_var_target_encoder(
                self.vis_encode_type,
                reuse_encoder,
            )
        else:
            self.encoder = self._create_encoder(
                self.feature_size,
                encoder_layers,
                self.vis_encode_type,
            )

            self.targ_encoder = self._create_target_encoder(
                self.vis_encode_type,
                reuse_encoder,
            )
        self.create_inverse_model(self.encoder, self.targ_encoder, inverse_layers)

        self.create_forward_model(
            self.encoder,
            self.targ_encoder,
            forward_layers,
            var_predict=var_predict,
        )

        self.create_reward_model(self.encoder, self.targ_encoder, forward_layers)

        self.create_bisim_model(
            self.h_size,
            self.feature_size,
            encoder_layers,
            self.vis_encode_type,
            forward_layers,
            var_predict,
            predict_return,
        )
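        # The bisim model rebuilds the encoder and prediction heads (with reuse=True)
        # on a second set of observation placeholders, so a bisimulation-style
        # distance between states can be trained when use_bisim is enabled.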
        if self.use_continuous_act:
            self._create_cc_actor(
                self.tanh_squash,
                self.reparameterize,
                self.condition_sigma_on_obs,
                separate_train,
            )
        else:
            self._create_dc_actor(
                self.encoder, self.h_size, policy_layers, separate_train
            )
        self.trainable_variables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
        )

        self._initialize_graph()

        # slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True)
    def load_graph_partial(
        self,
        path: str,
        transfer_type="dynamics",
        load_model=True,
        load_policy=True,
        load_value=True,
    ):
        load_nets = {"dynamics": [], "observation": ["encoding", "inverse"]}
        if load_model:
            load_nets["dynamics"].append("predict")
            if self.predict_return:
                load_nets["dynamics"].append("value")

        if self.inverse_model:
            load_nets["dynamics"].append("inverse")
            variables_to_restore = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, net
            )

        if transfer_type == "observation":
            self.run_hard_copy()
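        # Only the sub-networks listed in load_nets for the chosen transfer_type are
        # restored; for "observation" transfer the target encoder is then refreshed
        # from the newly loaded encoder via run_hard_copy().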
        feature_size: int,
        num_layers: int,
        vis_encode_type: EncoderType,
        predict_return: bool = False,
    ) -> tf.Tensor:
        """
        Builds the world model for state prediction
            ModelUtils.swish,
            num_layers,
            scope="main_graph",
            reuse=False,

            hidden_stream, feature_size + 1, name="next_state"

            hidden_stream, feature_size, name="next_state"
    @timed
    def evaluate(

        feature_size: int,
        num_layers: int,
        vis_encode_type: EncoderType,
        reuse_encoder: bool,
    ) -> tf.Tensor:
        if reuse_encoder:
            next_encoder_scope = "encoding"

            h_size,
            num_layers,
            vis_encode_type,
            reuse=reuse_encoder,

            hidden_stream_targ,
            feature_size,
            name="latent",
            reuse=reuse_encoder,
            # activation=ModelUtils.swish,
            kernel_initializer=tf.initializers.variance_scaling(1.0),
        )
    def _create_encoder(
        self,
        visual_in: List[tf.Tensor],

        """
        with tf.variable_scope("encoding"):
            hidden_stream = ModelUtils.create_observation_streams(
                visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
            )

                hidden_stream,
                feature_size,
                name="latent",
                # activation=ModelUtils.swish,
                kernel_initializer=tf.initializers.variance_scaling(1.0),
            )
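        # The "latent" dense layer projects the h_size observation stream down to
        # feature_size; that latent tensor is the state encoding consumed by the
        # forward, reward, and inverse models.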
    def _create_var_target_encoder(
        self,
        h_size: int,

        reuse_encoder: bool,
    ) -> tf.Tensor:
        if reuse_encoder:
            next_encoder_scope = "encoding"

            h_size,
            num_layers,
            vis_encode_type,
            reuse=reuse_encoder,

                hidden_stream_targ, feature_size, reuse=reuse_encoder
            )

            latent_targ = latent_targ_distribution.sample()
        h_size: int,
        feature_size: int,
        num_layers: int,
        vis_encode_type: EncoderType,
    ) -> tf.Tensor:
        """
        Creates a variational encoder for visual and vector observations.

        with tf.variable_scope("encoding"):
            hidden_stream = ModelUtils.create_observation_streams(
                visual_in, vector_in, 1, h_size, num_layers, vis_encode_type
            )

                hidden_stream, feature_size
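        # In the variational path the observation stream parameterizes a
        # GaussianEncoderDistribution rather than a deterministic latent; sampling
        # from it yields the encoding, and its KL term can act as a regularizer.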
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_enc")
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoding")

        with tf.variable_scope("hard_replacement"):
            self.target_replace_op = [
                tf.assign(t, 0.9 * t + 0.1 * e) for t, e in zip(t_params, e_params)
            ]
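        # Despite the "hard_replacement" name, target_replace_op applies a
        # Polyak-style soft update (t <- 0.9 * t + 0.1 * e) from the encoder
        # variables to the target encoder variables.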
    def run_hard_copy(self):
        self.sess.run(self.target_replace_op)

        """
        with tf.variable_scope("inverse"):
            combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
            hidden = tf.layers.dense(
                combined_input, self.h_size, activation=ModelUtils.swish
            )
            pred_action = tf.layers.dense(hidden, self.act_size[0], activation=None)
            squared_difference = tf.reduce_sum(
                tf.squared_difference(pred_action, self.current_action), axis=1
            )

            self.inverse_loss = tf.reduce_mean(
                tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
            )
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,

        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
        separate_train: bool = False,
    ) -> None:
        """
        Creates Continuous control actor-critic model.

        self.total_log_probs = distribution.total_log_probs
    def _create_dc_actor(
        self,
        encoded: tf.Tensor,
        h_size: int,
        num_layers: int,
        separate_train: bool = False,
    ) -> None:
        """
        Creates Discrete control actor-critic model.

        policy_checkpoint = os.path.join(self.model_path, "policy.ckpt")
        policy_saver.save(self.sess, policy_checkpoint)
        encoding_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
        )
        latent_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent"
        )
        predict_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "predict"
        )
        predict_saver = tf.train.Saver(predict_vars)
        predict_checkpoint = os.path.join(self.model_path, "predict.ckpt")
        predict_saver.save(self.sess, predict_checkpoint)

        value_saver.save(self.sess, value_checkpoint)

        if self.inverse_model:
            inverse_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "inverse"
            )

        reward_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "reward"
        )
    def create_target_normalizer(
        self, vector_obs: tf.Tensor, prefix="vn"
    ) -> NormalizerTensors:

            prefix + "_normalization_steps",
            [],
            trainable=False,
            dtype=tf.int32,

            prefix + "vn_running_mean",
            [vec_obs_size],
            trainable=False,
            dtype=tf.float32,

            prefix + "vn_running_variance",
            [vec_obs_size],
            trainable=False,
            dtype=tf.float32,

        return NormalizerTensors(
            update_normalization, steps, running_mean, running_variance
        )
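        # A sketch of the calls the argument blocks above belong to (assumed, based
        # on the usual ml-agents normalizer pattern; the initializers shown here are
        # assumptions):
        #   steps = tf.get_variable(
        #       prefix + "_normalization_steps", [], trainable=False,
        #       dtype=tf.int32, initializer=tf.zeros_initializer(),
        #   )
        #   running_mean = tf.get_variable(
        #       prefix + "vn_running_mean", [vec_obs_size], trainable=False,
        #       dtype=tf.float32, initializer=tf.zeros_initializer(),
        #   )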
    def update_normalization(
        self,
        vector_obs: np.ndarray,
        vector_obs_next: np.ndarray,
        vector_obs_bisim: np.ndarray,
    ) -> None:
        """
        If this policy normalizes vector observations, this will update the norm values in the graph.
        :param vector_obs: The vector observations to add to the running estimate of the distribution.

            self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
        )
        self.sess.run(
            self.vn_update_normalization_op,
            feed_dict={self.vector_next: vector_obs_next},
        )
        self.sess.run(
            self.bi_update_normalization_op,
            feed_dict={self.vector_bisim: vector_obs_bisim},
        )
        enc = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, "encoding/latent/bias:0"
        )
        targ = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/latent/bias:0"
        )
        print("encoding:", self.sess.run(enc))
        print("target:", self.sess.run(targ))

        rew = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "reward")
        print("reward:", self.sess.run(rew))
    def create_encoders(
        self, var_latent: bool = False, reuse_encoder: bool = False
    ) -> Tuple[tf.Tensor, tf.Tensor]:
        encoded_state_list = []
        encoded_next_state_list = []
        if reuse_encoder:

                    "stream_{}_visual_obs_encoder".format(i),
                    False,
                )

                with tf.variable_scope(next_encoder_scope):
                    encoded_next_visual = ModelUtils.create_visual_observation_encoder(
                        self.next_visual_in[i],
                        "stream_{}_visual_obs_encoder".format(i),
                        reuse_encoder,
                    )

                visual_encoders.append(encoded_visual)

                ModelUtils.swish,
                self.num_layers,
                "vector_obs_encoder",
                reuse_encoder,
            )
            encoded_state_list.append(encoded_vector_obs)
            encoded_next_state_list.append(encoded_next_vector_obs)
        if var_latent:
            with tf.variable_scope("encoding/latent"):
                encoded_state_dist = GaussianEncoderDistribution(
                    encoded_state, self.feature_size
                )

            with tf.variable_scope(next_encoder_scope + "/latent"):
                encoded_next_state_dist = GaussianEncoderDistribution(
                    encoded_next_state, self.feature_size, reuse=reuse_encoder
                )
            return (
                encoded_state,
                encoded_next_state,
                encoded_state_dist,
                encoded_next_state_dist,
            )

                encoded_state, self.feature_size, name="latent"
            )

                encoded_next_state,
                self.feature_size,
                name="latent",
                reuse=reuse_encoder,
            )
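        # In the non-variational branch both the current and the next observation
        # are projected through a dense "latent" layer of width feature_size;
        # reuse_encoder controls whether the next-state projection shares those weights.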
        self,
        encoded_state: tf.Tensor,
        encoded_next_state: tf.Tensor,
        inverse_layers: int,
    ) -> None:
        """
        Creates inverse model TensorFlow ops for Curiosity module.

        combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
        # hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
        hidden = combined_input
        for i in range(inverse_layers - 1):
            hidden = tf.layers.dense(
                hidden,
                self.h_size,

            pred_action = tf.concat(
                [
                    tf.layers.dense(
                        hidden,
                        self.act_size[i],
                        activation=tf.nn.softmax,
                        name="pred_action",
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
    def create_forward_model(
        self,
        encoded_state: tf.Tensor,
        encoded_next_state: tf.Tensor,
        forward_layers: int,
        var_predict: bool = False,
        separate_train: bool = False,
    ) -> None:
        """
        Creates forward model TensorFlow ops for Curiosity module.
        """
        combined_input = tf.concat([encoded_state, self.current_action], axis=1)
        hidden = combined_input
        if separate_train:
            hidden = tf.stop_gradient(hidden)

        if var_predict:
            self.predict_distribution = GaussianEncoderDistribution(
                hidden, self.feature_size
            )
            self.predict = self.predict_distribution.sample()
        else:

            )
        squared_difference = 0.5 * tf.reduce_sum(
            tf.squared_difference(self.predict, tf.stop_gradient(encoded_next_state)),
            axis=1,
        )
        self.forward_loss = tf.reduce_mean(
            tf.dynamic_partition(squared_difference, self.mask, 2)[1]
        )
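        # tf.stop_gradient on encoded_next_state keeps the forward loss from updating
        # the target encoding itself, so the predictor is pulled toward the encoding
        # rather than the encoding collapsing to make prediction trivially easy.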
    def create_reward_model(
        self,
        encoded_state: tf.Tensor,
        encoded_next_state: tf.Tensor,
        forward_layers: int,
        separate_train: bool = False,
    ):

        combined_input = tf.concat([encoded_state, self.current_action], axis=1)

        hidden = combined_input
        if separate_train:
            hidden = tf.stop_gradient(hidden)

                self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
                name="hidden_{}".format(i),
                # activation=ModelUtils.swish,
                # kernel_initializer=tf.initializers.variance_scaling(1.0),
            )

        self.reward_loss = tf.clip_by_value(
            tf.reduce_mean(
                tf.squared_difference(self.pred_reward, self.current_reward)
            ),
            1e-10,
            1.0,
        )
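        # The reward loss is the mean squared error between predicted and observed
        # reward, clipped to [1e-10, 1.0]; outside that range clip_by_value passes
        # no gradient through the loss.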
    def create_bisim_model(
        self,
        h_size: int,
        feature_size: int,
        encoder_layers: int,

        predict_return: bool,
    ) -> None:
        with tf.variable_scope("encoding"):
            self.visual_bisim = ModelUtils.create_visual_input_placeholders(

        if self.normalize:
            bi_normalization_tensors = self.create_target_normalizer(
                self.vector_bisim
            )
            self.bi_update_normalization_op = bi_normalization_tensors.update_op
            self.bi_normalization_steps = bi_normalization_tensors.steps
            self.bi_running_mean = bi_normalization_tensors.running_mean
                h_size,
                encoder_layers,
                vis_encode_type,
                reuse=True,

                hidden_stream,
                feature_size,
                name="latent",
                activation=ModelUtils.swish,
                kernel_initializer=tf.initializers.variance_scaling(1.0),
                reuse=True,
            )

        combined_input = tf.concat([self.bisim_encoder, self.bisim_action], axis=1)
        combined_input = tf.stop_gradient(combined_input)
        with tf.variable_scope("predict"):

            if var_predict:
                self.bisim_predict_distribution = GaussianEncoderDistribution(
                    hidden, self.feature_size, reuse=True
                )
                self.bisim_predict = self.bisim_predict_distribution.sample()
            else:

                for i in range(forward_layers):
                    hidden = tf.layers.dense(
                        hidden,
                        self.h_size * (self.vis_obs_size + int(self.vec_obs_size > 0)),
                        name="hidden_{}".format(i),
                        reuse=True,
                        # activation=ModelUtils.swish,
                        # kernel_initializer=tf.initializers.variance_scaling(1.0),
                    )
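                # reuse=True throughout the bisim branch ties these layers to the
                # corresponding main-model variables, so the bisim observations are
                # evaluated with the same weights instead of training a second copy
                # of the predictor.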