
Merge remote-tracking branch 'upstream/develop' into develop-flat-code-restructure

/develop-generalizationTraining-TrainerController
Deric Pang, 6 years ago
Current commit
d4ca94a1
3 files changed, 26 insertions(+), 15 deletions(-)
  1. MLAgentsSDK/Assets/ML-Agents/Scripts/CoreBrainInternal.cs (10 changed lines)
  2. python/ml-agents/mlagents/trainers/bc/models.py (4 changed lines)
  3. python/ml-agents/mlagents/trainers/models.py (27 changed lines)

MLAgentsSDK/Assets/ML-Agents/Scripts/CoreBrainInternal.cs (10 changed lines)


  bool hasMaskedActions;
  bool hasValueEstimate;
  float[,] inputState;
- int[] inputPrevAction;
+ int[,] inputPrevAction;
  List<float[,,,]> observationMatrixList;
  float[,] inputOldMemories;
  float[,] maskedActions;

  // Create the state tensor
  if (hasPrevAction)
  {
-     inputPrevAction = new int[currentBatchSize];
+     int totalNumberActions = brain.brainParameters.vectorActionSize.Length;
+     inputPrevAction = new int[currentBatchSize, totalNumberActions];
-     inputPrevAction[i] = Mathf.FloorToInt(actionList[0]);
+     for (var j = 0 ; j < totalNumberActions; j++)
+     {
+         inputPrevAction[i,j] = Mathf.FloorToInt(actionList[j]);
+     }
      i++;
  }
  }
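The C# side mirrors the move to branched discrete actions: the previous-action feed kept by CoreBrainInternal becomes a two-dimensional [batch, branches] array, with one column per action branch instead of a single flat vector. A minimal NumPy sketch of the resulting layout (illustrative only; the variable names here are hypothetical, not repo code):

import numpy as np

# Hypothetical stored vector actions for 3 agents and 2 action branches.
stored_vector_actions = [[1.0, 0.0], [2.0, 1.0], [0.0, 1.0]]
# Equivalent of Mathf.FloorToInt applied per branch into int[batch, branches].
prev_action = np.array([[int(a) for a in agent_actions]
                        for agent_actions in stored_vector_actions], dtype=np.int32)
print(prev_action.shape)  # (3, 2): batch of 3 agents, one column per branch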

python/ml-agents/mlagents/trainers/bc/models.py (4 changed lines)


  self.action_probs = tf.concat(
      [tf.nn.softmax(branch) for branch in policy_branches], axis=1, name="action_probs")
  self.action_masks = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name="action_masks")
- self.sample_action_float = self.create_discrete_action_masking_layer(
-     policy_branches, self.action_masks, self.a_size)
+ self.sample_action_float, _ = self.create_discrete_action_masking_layer(
+     tf.concat(policy_branches, axis=1), self.action_masks, self.a_size)
  self.sample_action_float = tf.identity(self.sample_action_float, name="action")
  self.sample_action = tf.cast(self.sample_action_float, tf.int32)
  self.true_action = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="teacher_action")
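The behavioral-cloning model now hands the masking layer a single concatenated logits tensor rather than a list of per-branch tensors, and discards the second return value because it only needs the sampled action. A small NumPy sketch of the [batch, sum(a_size)] layout the new argument expects (illustrative only; the branch sizes are made up):

import numpy as np

# Hypothetical per-branch logits for a policy with two discrete branches
# of sizes 3 and 2 (so a_size = [3, 2]).
branch_0 = np.array([[2.0, 1.0, 0.5]])
branch_1 = np.array([[0.0, 0.3]])

# Equivalent of tf.concat(policy_branches, axis=1): one [batch, sum(a_size)] tensor.
all_logits = np.concatenate([branch_0, branch_1], axis=1)
print(all_logits.shape)  # (1, 5)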

python/ml-agents/mlagents/trainers/models.py (27 changed lines)


      return hidden_flat

  @staticmethod
- def create_discrete_action_masking_layer(branches_logits, action_masks, action_size):
+ def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
-     :param branches_logits: A list of the unnormalized action probabilities fir each branch
+     :param all_logits: The concatenated unnormalized action probabilities for all branches
-     :return: The action output dimension [batch_size, num_branches]
+     :return: The action output dimension [batch_size, num_branches] and the concatenated normalized logits
+     branches_logits = [all_logits[:, action_idx[i]:action_idx[i + 1]] for i in range(len(action_size))]
-     raw_probs = [tf.multiply(tf.nn.softmax(branches_logits[k]), branch_masks[k])
+     raw_probs = [tf.multiply(tf.nn.softmax(branches_logits[k]), branch_masks[k]) + (1-branch_masks[k])*1.0e-10
-     return output
+     return output, tf.concat([tf.log(normalized_probs[k]) for k in range(len(action_size))], axis=1)

  def create_observation_streams(self, num_streams, h_size, num_layers):
      """

  self.all_log_probs = tf.concat([branch for branch in policy_branches], axis=1, name="action_probs")
  self.action_masks = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name="action_masks")
- output = self.create_discrete_action_masking_layer(policy_branches, self.action_masks, self.a_size)
+ output, normalized_logits = self.create_discrete_action_masking_layer(
+     self.all_log_probs, self.action_masks, self.a_size)
  self.output = tf.identity(output, name="action")

- self.action_holder = tf.placeholder(shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder")
+ self.action_holder = tf.placeholder(
+     shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder")
- self.all_old_log_probs = tf.placeholder(shape=[None, sum(self.a_size)], dtype=tf.float32, name='old_probabilities')
+ self.all_old_log_probs = tf.placeholder(
+     shape=[None, sum(self.a_size)], dtype=tf.float32, name='old_probabilities')
+ _, old_normalized_logits = self.create_discrete_action_masking_layer(
+     self.all_old_log_probs, self.action_masks, self.a_size)
  action_idx = [0] + list(np.cumsum(self.a_size))

  self.log_probs = tf.reduce_sum((tf.stack([
      -tf.nn.softmax_cross_entropy_with_logits_v2(
          labels=self.selected_actions[:, action_idx[i]:action_idx[i + 1]],
-         logits=self.all_log_probs[:, action_idx[i]:action_idx[i + 1]]
+         logits=normalized_logits[:, action_idx[i]:action_idx[i + 1]]
-         logits=self.all_old_log_probs[:, action_idx[i]:action_idx[i + 1]]
+         logits=old_normalized_logits[:, action_idx[i]:action_idx[i + 1]]
      )
      for i in range(len(self.a_size))], axis=1)), axis=1, keepdims=True)
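With the masking layer now returning normalized logits, the current and old selected-action log probabilities are computed from the masked distributions rather than the raw logits. A NumPy sketch of what the -softmax_cross_entropy_with_logits_v2 terms above add up to, assuming normalized_logits holds the log of the masked, renormalized probabilities (illustrative, not repo code):

import numpy as np

def log_prob_of_selected(normalized_logits, selected_onehot, action_size):
    # normalized_logits: log of masked, renormalized branch probabilities, [batch, sum(action_size)].
    # selected_onehot: one-hot encoding of the selected action per branch, same shape.
    action_idx = [0] + list(np.cumsum(action_size))
    total = np.zeros((normalized_logits.shape[0], 1))
    for i in range(len(action_size)):
        lp = normalized_logits[:, action_idx[i]:action_idx[i + 1]]
        sel = selected_onehot[:, action_idx[i]:action_idx[i + 1]]
        # log-softmax over the branch (a no-op when lp is already a log-distribution),
        # then pick the selected action's log probability.
        lsm = lp - np.log(np.exp(lp).sum(axis=1, keepdims=True))
        total += (sel * lsm).sum(axis=1, keepdims=True)
    return total

normalized_logits = np.log(np.array([[0.7, 1.0e-10, 0.3, 0.4, 0.6]]))
selected = np.array([[1.0, 0.0, 0.0, 0.0, 1.0]])  # action 0 in branch 0, action 1 in branch 1
print(log_prob_of_selected(normalized_logits, selected, action_size=[3, 2]))  # ~log(0.7) + log(0.6)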