|
|
|
|
|
|
return hidden_flat |
|
|
|
|
|
|
|
@staticmethod
def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
    """
    Creates a masking layer for the discrete actions.

    :param all_logits: The concatenated unnormalized action probabilities (logits)
        for all branches, shape [batch_size, sum(action_size)].
    :param action_masks: The mask for the logits — presumably 1.0 for allowed
        actions and 0.0 for forbidden ones, shape [batch_size, sum(action_size)].
        TODO(review): confirm mask convention against the caller that feeds the
        `action_masks` placeholder.
    :param action_size: A list with the number of possible actions for each branch.
    :return: A tuple of (sampled action indices of shape [batch_size, num_branches],
        the concatenated normalized log-probabilities for all branches).
    """
    # Branch boundaries within the concatenated logit/mask tensors,
    # e.g. action_size=[3, 2] -> action_idx=[0, 3, 5].
    action_idx = [0] + list(np.cumsum(action_size))
    # Slice the flat logits and masks back into per-branch tensors.
    branches_logits = [all_logits[:, action_idx[i]:action_idx[i + 1]] for i in range(len(action_size))]
    # NOTE(review): this line is not visible in the residue being resolved;
    # reconstructed symmetrically to `branches_logits` — confirm against upstream.
    branch_masks = [action_masks[:, action_idx[i]:action_idx[i + 1]] for i in range(len(action_size))]
    # Zero out masked actions, then give them a tiny floor (1e-10) so that
    # tf.log below never sees an exact zero for a fully-masked entry.
    raw_probs = [tf.multiply(tf.nn.softmax(branches_logits[k]), branch_masks[k]) + (1 - branch_masks[k]) * 1.0e-10
                 for k in range(len(action_size))]
    # Re-normalize each branch so the surviving probabilities sum to 1.
    # NOTE(review): reconstructed from the residue's surrounding lines — confirm.
    normalized_probs = [tf.divide(raw_probs[k], tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
                        for k in range(len(action_size))]
    # Sample one action per branch from the masked, renormalized distribution.
    # NOTE(review): reconstructed — confirm sampling op against upstream.
    output = tf.concat([tf.multinomial(tf.log(normalized_probs[k]), 1)
                        for k in range(len(action_size))], axis=1)
    return output, tf.concat([tf.log(normalized_probs[k]) for k in range(len(action_size))], axis=1)
|
|
|
|
|
|
|
def create_observation_streams(self, num_streams, h_size, num_layers): |
|
|
|
""" |
|
|
|
|
|
|
self.all_log_probs = tf.concat([branch for branch in policy_branches], axis=1, name="action_probs") |
|
|
|
|
|
|
|
self.action_masks = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name="action_masks") |
|
|
|
output = self.create_discrete_action_masking_layer(policy_branches, self.action_masks, self.a_size) |
|
|
|
output, normalized_logits = self.create_discrete_action_masking_layer( |
|
|
|
self.all_log_probs, self.action_masks, self.a_size) |
|
|
|
|
|
|
|
self.output = tf.identity(output, name="action") |
|
|
|
|
|
|
|
|
|
|
self.action_holder = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder") |
|
|
|
self.action_holder = tf.placeholder( |
|
|
|
shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder") |
|
|
|
self.all_old_log_probs = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name='old_probabilities') |
|
|
|
self.all_old_log_probs = tf.placeholder( |
|
|
|
shape=[None, sum(self.a_size)], dtype=tf.float32, name='old_probabilities') |
|
|
|
_, old_normalized_logits = self.create_discrete_action_masking_layer( |
|
|
|
self.all_old_log_probs, self.action_masks, self.a_size) |
|
|
|
|
|
|
|
action_idx = [0] + list(np.cumsum(self.a_size)) |
|
|
|
|
|
|
|
|
|
|
self.log_probs = tf.reduce_sum((tf.stack([ |
|
|
|
-tf.nn.softmax_cross_entropy_with_logits_v2( |
|
|
|
labels=self.selected_actions[:, action_idx[i]:action_idx[i + 1]], |
|
|
|
logits=self.all_log_probs[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
logits=normalized_logits[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
logits=self.all_old_log_probs[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
logits=old_normalized_logits[:, action_idx[i]:action_idx[i + 1]] |
|
|
|
) |
|
|
|
for i in range(len(self.a_size))], axis=1)), axis=1, keepdims=True) |
|
|
|
|