             # Create input ops for next (t+1) visual observations.
             visual_input = self.policy_model.create_visual_input(
                 self.policy_model.brain.camera_resolutions[i],
-                name="visual_observation_" + str(i),
+                name="gail_visual_observation_" + str(i),
             )
             self.expert_visual_in.append(visual_input)

             encoded_policy_visual = self.policy_model.create_visual_observation_encoder(
                 self.policy_model.visual_in[i],
                 self.encoding_size,
                 LearningModel.swish,
                 1,
-                "stream_{}_visual_obs_encoder".format(i),
+                "gail_stream_{}_visual_obs_encoder".format(i),
                 False,
             )

             encoded_expert_visual = self.policy_model.create_visual_observation_encoder(
                 self.expert_visual_in[i],
                 self.encoding_size,
                 LearningModel.swish,
                 1,
-                "stream_{}_visual_obs_encoder".format(i),
+                "gail_stream_{}_visual_obs_encoder".format(i),
                 True,
             )
             visual_policy_encoders.append(encoded_policy_visual)

         hidden_1 = tf.layers.dense(
             concat_input,
             self.h_size,
             activation=LearningModel.swish,
-            name="d_hidden_1",
+            name="gail_d_hidden_1",
             reuse=reuse,
         )

         hidden_2 = tf.layers.dense(
             hidden_1,
             self.h_size,
             activation=LearningModel.swish,
-            name="d_hidden_2",
+            name="gail_d_hidden_2",
             reuse=reuse,
         )

             z_mean = tf.layers.dense(
                 hidden_2,
                 self.z_size,
                 reuse=reuse,
-                name="z_mean",
+                name="gail_z_mean",
                 kernel_initializer=LearningModel.scaled_init(0.01),
             )

         estimate = tf.layers.dense(
             estimate_input,
             1,
             activation=tf.nn.sigmoid,
-            name="d_estimate",
+            name="gail_d_estimate",
             reuse=reuse,
         )
         return estimate, z_mean, concat_input

""" |
|
|
|
if self.use_vail: |
|
|
|
self.z_sigma = tf.get_variable( |
|
|
|
"sigma_vail", |
|
|
|
"gail_sigma_vail", |
|
|
|
self.z_size, |
|
|
|
dtype=tf.float32, |
|
|
|
initializer=tf.ones_initializer(), |
|
|
|
|
|
|
self.use_noise = tf.placeholder( |
|
|
|
shape=[1], dtype=tf.float32, name="NoiseLevel" |
|
|
|
shape=[1], dtype=tf.float32, name="gail_NoiseLevel" |
|
|
|
) |
|
|
|
         self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
             self.encoded_expert, self.expert_action, self.done_expert, reuse=False
         )
         self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
             self.encoded_policy,
             self.policy_model.selected_actions,
             self.done_policy,
             reuse=True,
         )
         self.discriminator_score = tf.reshape(
-            self.policy_estimate, [-1], name="GAIL_reward"
+            self.policy_estimate, [-1], name="gail_reward"
         )
         self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
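
The last line above turns the discriminator's sigmoid output D into the GAIL intrinsic reward, -log(1 - D + EPSILON), so policy behavior the discriminator mistakes for expert data earns a larger reward. A minimal NumPy sketch of just this mapping, assuming an EPSILON of 1e-7 (the diff does not show the constant's actual value):

import numpy as np

EPSILON = 1e-7  # assumed value; the constant is defined elsewhere in the source

def gail_intrinsic_reward(discriminator_score: np.ndarray) -> np.ndarray:
    # Scores near 1 ("looks like the expert") map to large rewards;
    # scores near 0 map to rewards near zero.
    return -np.log(1.0 - discriminator_score + EPSILON)

print(gail_intrinsic_reward(np.array([0.1, 0.5, 0.9])))
# approximately [0.105 0.693 2.303]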