
Clean up value head creation

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit: 28f7608f
4 files changed, 14 insertions(+), 21 deletions(-)
  1. ml-agents/mlagents/trainers/models.py (9 changes)
  2. ml-agents/mlagents/trainers/optimizer.py (14 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer.py (8 changes)
  4. ml-agents/mlagents/trainers/sac/optimizer.py (4 changes)

ml-agents/mlagents/trainers/models.py (9 changes)


         recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point])
         return recurrent_output, tf.concat([lstm_state_out.c, lstm_state_out.h], axis=1)
 
-    def create_value_heads(self, stream_names, hidden_input):
+    @staticmethod
+    def create_value_heads(stream_names, hidden_input):
         """
         Creates one value estimator head for each reward signal in stream_names.
         Also creates the node corresponding to the mean of all the value heads in self.value.
         self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
         :param stream_names: The list of reward signal names
         :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
         of the hidden input.
         """
+        value_heads = {}
         for name in stream_names:
             value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
-            self.value_heads[name] = value
-        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
+            value_heads[name] = value
+        value = tf.reduce_mean(list(value_heads.values()), 0)
+        return value_heads, value
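
For reference, a minimal sketch of how the refactored static method could be called after this change. The stream names and layer size are hypothetical illustrations; the import path follows the file location above, and TensorFlow 1.x is assumed.

import tensorflow as tf
from mlagents.trainers.models import LearningModel

# Hypothetical reward-signal streams and a critic hidden layer (illustration only).
stream_names = ["extrinsic", "curiosity"]
hidden_input = tf.placeholder(shape=[None, 128], dtype=tf.float32)

# The method no longer writes to self.value_heads / self.value;
# it returns the per-signal heads and their mean instead.
value_heads, value = LearningModel.create_value_heads(stream_names, hidden_input)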

ml-agents/mlagents/trainers/optimizer.py (14 changes)


             )
             self.update_dict.update(self.reward_signals[reward_signal].update_dict)
 
-    def create_value_heads(self, stream_names, hidden_input):
-        """
-        Creates one value estimator head for each reward signal in stream_names.
-        Also creates the node corresponding to the mean of all the value heads in self.value.
-        self.value_head is a dictionary of stream name to node containing the value estimator head for that signal.
-        :param stream_names: The list of reward signal names
-        :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
-        of the hidden input.
-        """
-        for name in stream_names:
-            value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
-            self.value_heads[name] = value
-        self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
-
     def _execute_model(self, feed_dict, out_dict):
         """
         Executes model.

ml-agents/mlagents/trainers/ppo/optimizer.py (8 changes)


         else:
             hidden_value = hidden_stream
-        self.create_value_heads(self.stream_names, hidden_value)
+        self.value_heads, self.value = LearningModel.create_value_heads(
+            self.stream_names, hidden_value
+        )
         self.all_old_log_probs = tf.placeholder(
             shape=[None, 1], dtype=tf.float32, name="old_probabilities"
         )

         else:
             hidden_value = hidden_stream
-        self.create_value_heads(self.stream_names, hidden_value)
+        self.value_heads, self.value = LearningModel.create_value_heads(
+            self.stream_names, hidden_value
+        )
         self.all_old_log_probs = tf.placeholder(
             shape=[None, sum(self.policy.act_size)],

ml-agents/mlagents/trainers/sac/optimizer.py (4 changes)


         self.policy = policy
         self.act_size = self.policy.act_size
         h_size = int(trainer_params["hidden_units"])
-        max_step = int(trainer_params["max_steps"])
+        max_step = float(trainer_params["max_steps"])
         num_layers = int(trainer_params["num_layers"])
         vis_encode_type = EncoderType(
             trainer_params.get("vis_encode_type", "simple")

         )
         self.create_inputs_and_outputs()
         self.learning_rate = LearningModel.create_learning_rate(
-            lr_schedule, lr, self.policy.global_step, max_step
+            lr_schedule, lr, self.policy.global_step, int(max_step)
         )
         self.create_losses(
             self.policy_network.q1_heads,
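
A note on the max_steps change above; the motivation is an assumption, not stated in the commit. Trainer configs sometimes write step counts in scientific notation, which Python's int() cannot parse from a string while float() can, so the value is cast back to an int only where an integer is needed, as in the create_learning_rate call.

max_steps_str = "5.0e5"            # hypothetical value from a trainer config
# int("5.0e5") would raise ValueError, so the string is read as a float first
max_step = float(max_steps_str)    # 500000.0
step_count = int(max_step)         # 500000, matching the int(max_step) cast above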
