|
|
|
|
|
|
import os |
|
|
|
from typing import Any, Dict, Optional, List |
|
|
|
from typing import Any, Dict, Optional, List, Tuple |
|
|
|
from mlagents.tf_utils import tf |
|
|
|
from mlagents_envs.timers import timed |
|
|
|
from mlagents_envs.base_env import DecisionSteps |
|
|
|
|
|
|
import tf_slim as slim |
|
|
|
EPSILON = 1e-6 # Small value to avoid divide by zero |
|
|
|
|
|
|
|
class GaussianEncoderDistribution: |
|
|
|
def __init__( |
|
|
|
self, |
|
|
|
encoded: tf.Tensor, |
|
|
|
feature_size: int |
|
|
|
): |
|
|
|
self.mu = tf.layers.dense( |
|
|
|
encoded, |
|
|
|
feature_size, |
|
|
|
activation=None, |
|
|
|
name="mu", |
|
|
|
kernel_initializer=ModelUtils.scaled_init(0.01), |
|
|
|
reuse=tf.AUTO_REUSE, |
|
|
|
) |
|
|
|
|
|
|
|
self.log_sigma = tf.layers.dense( |
|
|
|
encoded, |
|
|
|
feature_size, |
|
|
|
activation=None, |
|
|
|
name="log_std", |
|
|
|
kernel_initializer=ModelUtils.scaled_init(0.01), |
|
|
|
) |
|
|
|
|
|
|
|
self.sigma = tf.exp(self.log_sigma) |
|
|
|
|
|
|
|
def sample(self): |
|
|
|
epsilon = tf.random_normal(tf.shape(self.mu)) |
|
|
|
sampled = self.mu + self.sigma * epsilon |
|
|
|
|
|
|
|
return sampled |
|
|
|
|
|
|
|
def kl_standard(self): |
|
|
|
""" |
|
|
|
KL divergence with a standard gaussian |
|
|
|
""" |
|
|
|
kl = 0.5 * tf.reduce_sum(tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1) |
|
|
|
return kl |
|
|
|
|
|
|
|
|
|
|
|
class TransferPolicy(TFPolicy): |
|
|
|
def __init__( |
|
|
|
|
|
|
self.reparameterize = reparameterize |
|
|
|
self.condition_sigma_on_obs = condition_sigma_on_obs |
|
|
|
self.trainable_variables: List[tf.Variable] = [] |
|
|
|
self.encoder = None |
|
|
|
self.encoder_distribution = None |
|
|
|
self.targ_encoder = None |
|
|
|
self.separate_train = False # whether to train policy and model separately |
|
|
|
|
|
|
|
# Non-exposed parameters; these aren't exposed because they don't have a |
|
|
|
# good explanation and usually shouldn't be touched. |
|
|
|
|
|
|
""" |
|
|
|
return self.trainable_variables |
|
|
|
|
|
|
|
def create_tf_graph(self, transfer=False) -> None: |
|
|
|
def create_tf_graph(self, |
|
|
|
encoder_layers = 1, |
|
|
|
policy_layers = 1, |
|
|
|
policy_units = 128, |
|
|
|
transfer=False, |
|
|
|
separate_train=False, |
|
|
|
var_encoder=False, |
|
|
|
var_predict=False, |
|
|
|
predict_return=False, |
|
|
|
inverse_model=False |
|
|
|
) -> None: |
|
|
|
self.inverse_model = inverse_model |
|
|
|
with self.graph.as_default(): |
|
|
|
tf.set_random_seed(self.seed) |
|
|
|
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) |
|
|
|
|
|
|
return |
|
|
|
|
|
|
|
self.create_input_placeholders() |
|
|
|
self.current_action = tf.placeholder( |
|
|
|
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action" |
|
|
|
) |
|
|
|
# latent feature encoder |
|
|
|
if transfer: |
|
|
|
n_layers = self.num_layers + 1 |
|
|
|
else: |
|
|
|
n_layers = self.num_layers |
|
|
|
self.encoder = self._create_encoder( |
|
|
|
self.visual_in, |
|
|
|
self.processed_vector_in, |
|
|
|
self.h_size, |
|
|
|
self.feature_size, |
|
|
|
n_layers, |
|
|
|
self.vis_encode_type |
|
|
|
) |
|
|
|
self.next_visual_in: List[tf.Tensor] = [] |
|
|
|
with tf.variable_scope("encoding"): |
|
|
|
self.encoder, self.targ_encoder = self.create_encoders() |
|
|
|
with tf.variable_scope("inverse"): |
|
|
|
self.create_inverse_model(self.encoder, self.targ_encoder) |
|
|
|
with tf.variable_scope("predict"): |
|
|
|
self.create_forward_model(self.encoder, self.targ_encoder) |
|
|
|
self.targ_encoder = self._create_target_encoder( |
|
|
|
self.h_size, |
|
|
|
self.feature_size, |
|
|
|
n_layers, |
|
|
|
self.vis_encode_type |
|
|
|
) |
|
|
|
# if var_encoder: |
|
|
|
# self.encoder_distribution, self.encoder = self._create_var_encoder( |
|
|
|
# self.visual_in, |
|
|
|
# self.processed_vector_in, |
|
|
|
# self.h_size, |
|
|
|
# self.feature_size, |
|
|
|
# encoder_layers, |
|
|
|
# self.vis_encode_type |
|
|
|
# ) |
|
|
|
self.hard_copy_encoder() |
|
|
|
# _, self.targ_encoder = self._create_var_target_encoder( |
|
|
|
# self.h_size, |
|
|
|
# self.feature_size, |
|
|
|
# encoder_layers, |
|
|
|
# self.vis_encode_type |
|
|
|
# ) |
|
|
|
# else: |
|
|
|
# self.encoder = self._create_encoder( |
|
|
|
# self.visual_in, |
|
|
|
# self.processed_vector_in, |
|
|
|
# self.h_size, |
|
|
|
# self.feature_size, |
|
|
|
# encoder_layers, |
|
|
|
# self.vis_encode_type |
|
|
|
# ) |
|
|
|
self.predict = self._create_world_model( |
|
|
|
self.encoder, |
|
|
|
self.h_size, |
|
|
|
self.feature_size, |
|
|
|
self.num_layers, |
|
|
|
self.vis_encode_type |
|
|
|
) |
|
|
|
# self.targ_encoder = self._create_target_encoder( |
|
|
|
# self.h_size, |
|
|
|
# self.feature_size, |
|
|
|
# encoder_layers, |
|
|
|
# self.vis_encode_type |
|
|
|
# ) |
|
|
|
|
|
|
|
# self._create_hard_copy() |
|
|
|
|
|
|
|
# if var_predict: |
|
|
|
# self.predict_distribution, self.predict = self._create_var_world_model( |
|
|
|
# self.encoder, |
|
|
|
# self.h_size, |
|
|
|
# self.feature_size, |
|
|
|
# self.num_layers, |
|
|
|
# self.vis_encode_type, |
|
|
|
# predict_return |
|
|
|
# ) |
|
|
|
# else: |
|
|
|
# self.predict = self._create_world_model( |
|
|
|
# self.encoder, |
|
|
|
# self.h_size, |
|
|
|
# self.feature_size, |
|
|
|
# self.num_layers, |
|
|
|
# self.vis_encode_type, |
|
|
|
# predict_return |
|
|
|
# ) |
|
|
|
|
|
|
|
# if inverse_model: |
|
|
|
# self._create_inverse_model(self.encoder, self.targ_encoder) |
|
|
|
self.h_size, |
|
|
|
self.num_layers, |
|
|
|
policy_units, |
|
|
|
policy_layers, |
|
|
|
separate_train |
|
|
|
self._create_dc_actor(self.encoder, self.h_size, self.num_layers) |
|
|
|
self._create_dc_actor(self.encoder, policy_units, policy_layers, separate_train) |
|
|
|
|
|
|
|
self.trainable_variables = tf.get_collection( |
|
|
|
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy" |
|
|
|
) |
|
|
|
|
|
|
self.trainable_variables += tf.get_collection( |
|
|
|
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm" |
|
|
|
) # LSTMs need to be root scope for Barracuda export |
|
|
|
if not transfer: |
|
|
|
self.trainable_variables += tf.get_collection( |
|
|
|
tf.GraphKeys.TRAINABLE_VARIABLES, scope="inverse" |
|
|
|
) |
|
|
|
|
|
|
|
self.inference_dict: Dict[str, tf.Tensor] = { |
|
|
|
"action": self.output, |
|
|
|
|
|
|
# slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True) |
|
|
|
|
|
|
|
def load_graph_partial(self, path: str, transfer_type="dynamics"): |
|
|
|
load_nets = {"dynamics": ["policy", "predict", "value"], "observation": ["encoding"]} |
|
|
|
for net in load_nets[transfer_type]: |
|
|
|
variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, net) |
|
|
|
partial_saver = tf.train.Saver(variables_to_restore) |
|
|
|
partial_model_checkpoint = os.path.join(path, f"{net}.ckpt") |
|
|
|
partial_saver.restore(self.sess, partial_model_checkpoint) |
|
|
|
print("loaded net", net, "from path", path) |
|
|
|
|
|
|
|
load_nets = {"dynamics": ["policy", "predict", "value"], |
|
|
|
"observation": ["encoding", "inverse"]} |
|
|
|
if self.inverse_model: |
|
|
|
load_nets["dynamics"].append("inverse") |
|
|
|
with self.graph.as_default(): |
|
|
|
for net in load_nets[transfer_type]: |
|
|
|
variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, net) |
|
|
|
partial_saver = tf.train.Saver(variables_to_restore) |
|
|
|
partial_model_checkpoint = os.path.join(path, f"{net}.ckpt") |
|
|
|
partial_saver.restore(self.sess, partial_model_checkpoint) |
|
|
|
print("loaded net", net, "from path", path) |
|
|
|
|
|
|
|
|
|
|
|
self.hard_copy_encoder() |
|
|
|
self.run_hard_copy() |
|
|
|
|
|
|
|
def _create_world_model( |
|
|
|
self, |
|
|
|
|
|
|
num_layers: int, |
|
|
|
vis_encode_type: EncoderType, |
|
|
|
predict_return: bool=False |
|
|
|
) -> tf.Tensor: |
|
|
|
"""" |
|
|
|
Builds the world model for state prediction |
|
|
|
|
|
|
scope=f"main_graph", |
|
|
|
reuse=False |
|
|
|
) |
|
|
|
predict = tf.layers.dense( |
|
|
|
hidden_stream, |
|
|
|
feature_size, |
|
|
|
name="next_state" |
|
|
|
) |
|
|
|
if predict_return: |
|
|
|
predict = tf.layers.dense( |
|
|
|
hidden_stream, |
|
|
|
feature_size+1, |
|
|
|
name="next_state" |
|
|
|
) |
|
|
|
else: |
|
|
|
predict = tf.layers.dense( |
|
|
|
hidden_stream, |
|
|
|
feature_size, |
|
|
|
name="next_state" |
|
|
|
) |
|
|
|
def _create_var_world_model( |
|
|
|
self, |
|
|
|
encoder: tf.Tensor, |
|
|
|
h_size: int, |
|
|
|
feature_size: int, |
|
|
|
num_layers: int, |
|
|
|
vis_encode_type: EncoderType, |
|
|
|
predict_return: bool=False |
|
|
|
) -> tf.Tensor: |
|
|
|
"""" |
|
|
|
Builds the world model for state prediction |
|
|
|
""" |
|
|
|
with self.graph.as_default(): |
|
|
|
with tf.variable_scope("predict"): |
|
|
|
|
|
|
|
hidden_stream = ModelUtils.create_vector_observation_encoder( |
|
|
|
tf.concat([encoder, self.current_action], axis=1), |
|
|
|
h_size, |
|
|
|
ModelUtils.swish, |
|
|
|
num_layers, |
|
|
|
scope=f"main_graph", |
|
|
|
reuse=False |
|
|
|
) |
|
|
|
with tf.variable_scope("latent"): |
|
|
|
if predict_return: |
|
|
|
predict_distribution = GaussianEncoderDistribution( |
|
|
|
hidden_stream, |
|
|
|
feature_size+1 |
|
|
|
) |
|
|
|
else: |
|
|
|
predict_distribution = GaussianEncoderDistribution( |
|
|
|
hidden_stream, |
|
|
|
feature_size |
|
|
|
) |
|
|
|
|
|
|
|
predict = predict_distribution.sample() |
|
|
|
return predict_distribution, predict |
|
|
|
|
|
|
|
@timed |
|
|
|
def evaluate( |
|
|
|
self, decision_requests: DecisionSteps, global_agent_ids: List[str] |
|
|
|
|
|
|
""" |
|
|
|
with tf.variable_scope("encoding"): |
|
|
|
hidden_stream = ModelUtils.create_observation_streams( |
|
|
|
self.visual_in, |
|
|
|
self.processed_vector_in, |
|
|
|
visual_in, |
|
|
|
vector_in, |
|
|
|
1, |
|
|
|
h_size, |
|
|
|
num_layers, |
|
|
|
|
|
|
) |
|
|
|
return latent |
|
|
|
|
|
|
|
def hard_copy_encoder(self): |
|
|
|
def _create_var_target_encoder( |
|
|
|
self, |
|
|
|
h_size: int, |
|
|
|
feature_size: int, |
|
|
|
num_layers: int, |
|
|
|
vis_encode_type: EncoderType, |
|
|
|
) -> tf.Tensor: |
|
|
|
self.visual_next = ModelUtils.create_visual_input_placeholders( |
|
|
|
self.brain.camera_resolutions |
|
|
|
) |
|
|
|
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size) |
|
|
|
if self.normalize: |
|
|
|
self.processed_vector_next = ModelUtils.normalize_vector_obs( |
|
|
|
self.vector_next, |
|
|
|
self.running_mean, |
|
|
|
self.running_variance, |
|
|
|
self.normalization_steps, |
|
|
|
) |
|
|
|
else: |
|
|
|
self.processed_vector_next = self.vector_next |
|
|
|
|
|
|
|
with tf.variable_scope("target_enc"): |
|
|
|
hidden_stream_targ = ModelUtils.create_observation_streams( |
|
|
|
self.visual_next, |
|
|
|
self.processed_vector_next, |
|
|
|
1, |
|
|
|
h_size, |
|
|
|
num_layers, |
|
|
|
vis_encode_type, |
|
|
|
)[0] |
|
|
|
|
|
|
|
with tf.variable_scope("latent"): |
|
|
|
latent_targ_distribution = GaussianEncoderDistribution( |
|
|
|
hidden_stream_targ, |
|
|
|
feature_size |
|
|
|
) |
|
|
|
|
|
|
|
latent_targ = latent_targ_distribution.sample() |
|
|
|
|
|
|
|
return latent_targ_distribution, tf.stop_gradient(latent_targ) |
|
|
|
|
|
|
|
def _create_var_encoder( |
|
|
|
self, |
|
|
|
visual_in: List[tf.Tensor], |
|
|
|
vector_in: tf.Tensor, |
|
|
|
h_size: int, |
|
|
|
feature_size: int, |
|
|
|
num_layers: int, |
|
|
|
vis_encode_type: EncoderType |
|
|
|
) -> tf.Tensor: |
|
|
|
""" |
|
|
|
Creates a variational encoder for visual and vector observations. |
|
|
|
:param h_size: Size of hidden linear layers. |
|
|
|
:param num_layers: Number of hidden linear layers. |
|
|
|
:param vis_encode_type: Type of visual encoder to use if visual input. |
|
|
|
:return: The hidden layer (tf.Tensor) after the encoder. |
|
|
|
""" |
|
|
|
|
|
|
|
with tf.variable_scope("encoding"): |
|
|
|
hidden_stream = ModelUtils.create_observation_streams( |
|
|
|
visual_in, |
|
|
|
vector_in, |
|
|
|
1, |
|
|
|
h_size, |
|
|
|
num_layers, |
|
|
|
vis_encode_type, |
|
|
|
)[0] |
|
|
|
|
|
|
|
with tf.variable_scope("latent"): |
|
|
|
latent_distribution = GaussianEncoderDistribution( |
|
|
|
hidden_stream, |
|
|
|
feature_size |
|
|
|
) |
|
|
|
|
|
|
|
latent = latent_distribution.sample() |
|
|
|
|
|
|
|
return latent_distribution, latent |
|
|
|
|
|
|
|
def _create_hard_copy(self): |
|
|
|
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_enc') |
|
|
|
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoding') |
|
|
|
|
|
|
|
|
|
|
def run_hard_copy(self): |
|
|
|
self.sess.run(self.target_replace_op) |
|
|
|
|
|
|
|
def _create_inverse_model( |
|
|
|
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor |
|
|
|
) -> None: |
|
|
|
""" |
|
|
|
Creates inverse model TensorFlow ops for Curiosity module. |
|
|
|
Predicts action taken given current and future encoded states. |
|
|
|
:param encoded_state: Tensor corresponding to encoded current state. |
|
|
|
:param encoded_next_state: Tensor corresponding to encoded next state. |
|
|
|
""" |
|
|
|
with tf.variable_scope("inverse"): |
|
|
|
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1) |
|
|
|
hidden = tf.layers.dense(combined_input, self.h_size, activation=ModelUtils.swish) |
|
|
|
if self.brain.vector_action_space_type == "continuous": |
|
|
|
pred_action = tf.layers.dense( |
|
|
|
hidden, self.act_size[0], activation=None |
|
|
|
) |
|
|
|
squared_difference = tf.reduce_sum( |
|
|
|
tf.squared_difference(pred_action, self.current_action), axis=1 |
|
|
|
) |
|
|
|
self.inverse_loss = tf.reduce_mean( |
|
|
|
tf.dynamic_partition(squared_difference, self.mask, 2)[1] |
|
|
|
) |
|
|
|
else: |
|
|
|
pred_action = tf.concat( |
|
|
|
[ |
|
|
|
tf.layers.dense( |
|
|
|
hidden, self.act_size[i], activation=tf.nn.softmax |
|
|
|
) |
|
|
|
for i in range(len(self.act_size)) |
|
|
|
], |
|
|
|
axis=1, |
|
|
|
) |
|
|
|
cross_entropy = tf.reduce_sum( |
|
|
|
-tf.log(pred_action + 1e-10) * self.current_action, axis=1 |
|
|
|
) |
|
|
|
self.inverse_loss = tf.reduce_mean( |
|
|
|
tf.dynamic_partition(cross_entropy, self.mask, 2)[1] |
|
|
|
) |
|
|
|
|
|
|
|
def _create_cc_actor( |
|
|
|
self, |
|
|
|
encoded: tf.Tensor, |
|
|
|
|
|
|
reparameterize: bool = False, |
|
|
|
condition_sigma_on_obs: bool = True, |
|
|
|
separate_train: bool = False |
|
|
|
) -> None: |
|
|
|
""" |
|
|
|
Creates Continuous control actor-critic model. |
|
|
|
|
|
|
else: |
|
|
|
hidden_policy = encoded |
|
|
|
|
|
|
|
if self.separate_train: |
|
|
|
if separate_train: |
|
|
|
hidden_policy = tf.stop_gradient(hidden_policy) |
|
|
|
|
|
|
|
with tf.variable_scope("policy"): |
|
|
|
|
|
|
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control. |
|
|
|
self.total_log_probs = distribution.total_log_probs |
|
|
|
|
|
|
|
def _create_dc_actor(self, encoded: tf.Tensor, h_size: int, num_layers: int) -> None: |
|
|
|
def _create_dc_actor( |
|
|
|
self, |
|
|
|
encoded: tf.Tensor, |
|
|
|
h_size: int, |
|
|
|
num_layers: int, |
|
|
|
separate_train: bool = False |
|
|
|
) -> None: |
|
|
|
""" |
|
|
|
Creates Discrete control actor-critic model. |
|
|
|
:param h_size: Size of hidden linear layers. |
|
|
|
|
|
|
else: |
|
|
|
hidden_policy = encoded |
|
|
|
|
|
|
|
if self.separate_train: |
|
|
|
if separate_train: |
|
|
|
hidden_policy = tf.stop_gradient(hidden_policy) |
|
|
|
self.action_masks = tf.placeholder( |
|
|
|
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks" |
|
|
|
|
|
|
encoding_checkpoint = os.path.join(self.model_path, f"encoding.ckpt") |
|
|
|
encoding_saver.save(self.sess, encoding_checkpoint) |
|
|
|
|
|
|
|
latent_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent") |
|
|
|
latent_saver = tf.train.Saver(latent_vars) |
|
|
|
latent_checkpoint = os.path.join(self.model_path, f"latent.ckpt") |
|
|
|
latent_saver.save(self.sess, latent_checkpoint) |
|
|
|
# latent_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent") |
|
|
|
# latent_saver = tf.train.Saver(latent_vars) |
|
|
|
# latent_checkpoint = os.path.join(self.model_path, f"latent.ckpt") |
|
|
|
# latent_saver.save(self.sess, latent_checkpoint) |
|
|
|
|
|
|
|
predict_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") |
|
|
|
predict_saver = tf.train.Saver(predict_vars) |
|
|
|
|
|
|
value_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value") |
|
|
|
value_saver = tf.train.Saver(value_vars) |
|
|
|
value_checkpoint = os.path.join(self.model_path, f"value.ckpt") |
|
|
|
value_saver.save(self.sess, value_checkpoint) |
|
|
|
value_saver.save(self.sess, value_checkpoint) |
|
|
|
|
|
|
|
if self.inverse_model: |
|
|
|
inverse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") |
|
|
|
inverse_saver = tf.train.Saver(inverse_vars) |
|
|
|
inverse_checkpoint = os.path.join(self.model_path, f"inverse.ckpt") |
|
|
|
inverse_saver.save(self.sess, inverse_checkpoint) |
|
|
|
|
|
|
|
def get_encoder_weights(self): |
|
|
|
with self.graph.as_default(): |
|
|
|
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding/main_graph_0/hidden_0/bias:0") |
|
|
|
targ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/main_graph_0/hidden_0/bias:0") |
|
|
|
print("encoding:", self.sess.run(enc)) |
|
|
|
print("target:", self.sess.run(targ)) |
|
|
|
|
|
|
|
def create_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]: |
|
|
|
encoded_state_list = [] |
|
|
|
encoded_next_state_list = [] |
|
|
|
|
|
|
|
if self.vis_obs_size > 0: |
|
|
|
self.next_visual_in = [] |
|
|
|
visual_encoders = [] |
|
|
|
next_visual_encoders = [] |
|
|
|
for i in range(self.vis_obs_size): |
|
|
|
# Create input ops for next (t+1) visual observations. |
|
|
|
next_visual_input = ModelUtils.create_visual_input( |
|
|
|
self.brain.camera_resolutions[i], |
|
|
|
name="curiosity_next_visual_observation_" + str(i), |
|
|
|
) |
|
|
|
self.next_visual_in.append(next_visual_input) |
|
|
|
|
|
|
|
# Create the encoder ops for current and next visual input. |
|
|
|
# Note that these encoders are siamese. |
|
|
|
encoded_visual = ModelUtils.create_visual_observation_encoder( |
|
|
|
self.visual_in[i], |
|
|
|
self.h_size, |
|
|
|
ModelUtils.swish, |
|
|
|
self.num_layers, |
|
|
|
"curiosity_stream_{}_visual_obs_encoder".format(i), |
|
|
|
False, |
|
|
|
) |
|
|
|
|
|
|
|
encoded_next_visual = ModelUtils.create_visual_observation_encoder( |
|
|
|
self.next_visual_in[i], |
|
|
|
self.h_size, |
|
|
|
ModelUtils.swish, |
|
|
|
self.num_layers, |
|
|
|
"curiosity_stream_{}_visual_obs_encoder".format(i), |
|
|
|
True, |
|
|
|
) |
|
|
|
visual_encoders.append(encoded_visual) |
|
|
|
next_visual_encoders.append(encoded_next_visual) |
|
|
|
|
|
|
|
hidden_visual = tf.concat(visual_encoders, axis=1) |
|
|
|
hidden_next_visual = tf.concat(next_visual_encoders, axis=1) |
|
|
|
encoded_state_list.append(hidden_visual) |
|
|
|
encoded_next_state_list.append(hidden_next_visual) |
|
|
|
|
|
|
|
if self.vec_obs_size > 0: |
|
|
|
# Create the encoder ops for current and next vector input. |
|
|
|
# Note that these encoders are siamese. |
|
|
|
# Create input op for next (t+1) vector observation. |
|
|
|
self.next_vector_in = tf.placeholder( |
|
|
|
shape=[None, self.vec_obs_size], |
|
|
|
dtype=tf.float32, |
|
|
|
name="curiosity_next_vector_observation", |
|
|
|
) |
|
|
|
|
|
|
|
encoded_vector_obs = ModelUtils.create_vector_observation_encoder( |
|
|
|
self.vector_in, |
|
|
|
self.h_size, |
|
|
|
ModelUtils.swish, |
|
|
|
self.num_layers, |
|
|
|
"curiosity_vector_obs_encoder", |
|
|
|
False, |
|
|
|
) |
|
|
|
encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder( |
|
|
|
self.next_vector_in, |
|
|
|
self.h_size, |
|
|
|
ModelUtils.swish, |
|
|
|
self.num_layers, |
|
|
|
"curiosity_vector_obs_encoder", |
|
|
|
True, |
|
|
|
) |
|
|
|
encoded_state_list.append(encoded_vector_obs) |
|
|
|
encoded_next_state_list.append(encoded_next_vector_obs) |
|
|
|
|
|
|
|
encoded_state = tf.concat(encoded_state_list, axis=1) |
|
|
|
encoded_next_state = tf.concat(encoded_next_state_list, axis=1) |
|
|
|
|
|
|
|
encoded_state = tf.layers.dense( |
|
|
|
encoded_state, |
|
|
|
self.feature_size, |
|
|
|
name="latent" |
|
|
|
) |
|
|
|
encoded_next_state = tf.layers.dense( |
|
|
|
encoded_next_state, |
|
|
|
self.feature_size, |
|
|
|
name="latent", |
|
|
|
reuse=True |
|
|
|
) |
|
|
|
|
|
|
|
return encoded_state, encoded_next_state |
|
|
|
|
|
|
|
def create_inverse_model( |
|
|
|
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor |
|
|
|
) -> None: |
|
|
|
""" |
|
|
|
Creates inverse model TensorFlow ops for Curiosity module. |
|
|
|
Predicts action taken given current and future encoded states. |
|
|
|
:param encoded_state: Tensor corresponding to encoded current state. |
|
|
|
:param encoded_next_state: Tensor corresponding to encoded next state. |
|
|
|
""" |
|
|
|
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1) |
|
|
|
# hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish) |
|
|
|
if self.brain.vector_action_space_type == "continuous": |
|
|
|
pred_action = tf.layers.dense( |
|
|
|
combined_input, self.act_size[0], activation=None |
|
|
|
) |
|
|
|
squared_difference = tf.reduce_sum( |
|
|
|
tf.squared_difference(pred_action, self.current_action), axis=1 |
|
|
|
) |
|
|
|
self.inverse_loss = tf.reduce_mean( |
|
|
|
tf.dynamic_partition(squared_difference, self.mask, 2)[1] |
|
|
|
) |
|
|
|
else: |
|
|
|
pred_action = tf.concat( |
|
|
|
[ |
|
|
|
tf.layers.dense( |
|
|
|
combined_input, self.act_size[i], activation=tf.nn.softmax |
|
|
|
) |
|
|
|
for i in range(len(self.act_size)) |
|
|
|
], |
|
|
|
axis=1, |
|
|
|
) |
|
|
|
cross_entropy = tf.reduce_sum( |
|
|
|
-tf.log(pred_action + 1e-10) * self.current_action, axis=1 |
|
|
|
) |
|
|
|
self.inverse_loss = tf.reduce_mean( |
|
|
|
tf.dynamic_partition(cross_entropy, self.mask, 2)[1] |
|
|
|
) |
|
|
|
|
|
|
|
def create_forward_model( |
|
|
|
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor |
|
|
|
) -> None: |
|
|
|
""" |
|
|
|
Creates forward model TensorFlow ops for Curiosity module. |
|
|
|
Predicts encoded future state based on encoded current state and given action. |
|
|
|
:param encoded_state: Tensor corresponding to encoded current state. |
|
|
|
:param encoded_next_state: Tensor corresponding to encoded next state. |
|
|
|
""" |
|
|
|
combined_input = tf.concat( |
|
|
|
[encoded_state, self.current_action], axis=1 |
|
|
|
) |
|
|
|
# hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish) |
|
|
|
predict = tf.layers.dense( |
|
|
|
combined_input, |
|
|
|
self.h_size |
|
|
|
* (self.vis_obs_size + int(self.vec_obs_size > 0)), |
|
|
|
activation=None, |
|
|
|
) |
|
|
|
self.predict = tf.layers.dense( |
|
|
|
predict, |
|
|
|
self.feature_size, |
|
|
|
name="latent" |
|
|
|
) |
|
|
|
squared_difference = 0.5 * tf.reduce_sum( |
|
|
|
tf.squared_difference(self.predict, encoded_next_state), axis=1 |
|
|
|
) |
|
|
|
self.intrinsic_reward = squared_difference |
|
|
|
self.forward_loss = tf.reduce_mean( |
|
|
|
tf.dynamic_partition(squared_difference, self.mask, 2)[1] |
|
|
|
) |