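# Imports assumed by this excerpt (not shown in the fragment). In ML-Agents'
# TF trainers, `tf` is the TF1 graph-mode module exposed by mlagents.tf_utils;
# `import tensorflow.compat.v1 as tf` would play the same role elsewhere.
import copy
from typing import Dict

from mlagents.tf_utils import tf
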
self.train_model = hyperparameters.train_model
self.train_policy = hyperparameters.train_policy
self.train_value = hyperparameters.train_value
self.transfer_path = hyperparameters.transfer_path
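
# Fetch dictionaries for the additional update phases: the reward-model update,
# the dynamics-model-only update, and the bisimulation update.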
self.reward_update_dict: Dict[str, tf.Tensor] = {}
self.model_only_update_dict: Dict[str, tf.Tensor] = {}
self.bisim_update_dict: Dict[str, tf.Tensor] = {}

# Assumption: these loss/op fetches extend the optimizer's standard self.update_dict.
self.update_dict.update(
    {
        "value_loss": self.value_loss,
        "policy_loss": self.abs_policy_loss,
        "model_loss": self.model_loss,
        "reward_loss": self.reward_loss,
        "update_batch": self.update_batch,
        "learning_rate": self.learning_rate,
        "decay_epsilon": self.decay_epsilon,
    }
)
# self.model_loss += self.policy.predict_distribution.kl_standard()
self.model_loss = self.policy.forward_loss
if self.predict_return:
    self.model_loss += 0.5 * self.policy.reward_loss
self.reward_loss = self.policy.reward_loss
# if self.predict_return:
#     self.model_loss += 0.5 * self.policy.reward_loss
if self.with_prior:
    if self.use_var_encoder:
        self.model_loss += 0.2 * self.policy.encoder_distribution.kl_standard()
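
# Builds the model/reward optimizer ops over the variable scopes selected by the
# train_* flags (state encoder, action encoder, dynamics model, reward head).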
def _create_ppo_optimizer_ops(self):
    train_vars = []
    if self.train_encoder:
        train_vars += tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "encoding"
        )
    if self.train_action:
        train_vars += tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "action_enc"
        )
    if self.train_model:
        train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
        train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse")
        train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")

    self.model_update_batch = self.model_optimizer.minimize(
        self.model_loss, var_list=train_vars
    )
    self.reward_update_batch = self.model_optimizer.minimize(
        self.reward_loss, var_list=train_vars
    )
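
    # Fetch dictionaries handed to sess.run (via _execute_model) for the model
    # and reward update passes.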
    model_train_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, "predict"
    )
    self.model_update_dict.update(
        {
            "model_loss": self.model_loss,
            "update_batch": self.model_update_batch,
            "model_learning_rate": self.model_learning_rate,
            "decay_epsilon": self.decay_epsilon,
            "decay_beta": self.decay_beta,
        }
    )
    self.reward_update_dict.update(
        {
            "reward_loss": self.reward_loss,
            "reward_update_batch": self.reward_update_batch,
        }
    )
    self.model_only_update_dict.update(
        {
            "model_loss": self.model_loss,
        }
    )
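
# Later fragment, apparently from the optimizer's update path: with in-batch
# alternation enabled, the model and reward updates are run on the current batch.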
elif self.in_batch_alter:
    update_vals = self._execute_model(feed_dict, self.model_update_dict)
    update_vals.update(self._execute_model(feed_dict, self.reward_update_dict))
    # print(self._execute_model(feed_dict, {"pred": self.policy.predict, "enc": self.policy.next_state}))
    if self.use_bisim:
        # Keep an unshuffled copy and shuffle the original so that each sample
        # is paired with a randomly drawn partner for the bisimulation update.
        batch1 = copy.deepcopy(batch)
        batch.shuffle(sequence_length=1)