
[change] Remove the action_holder placeholder from the policy. (#3492)

/asymm-envs
GitHub · 5 years ago
Current commit: 97a1d4b1
7 files changed, 17 insertions(+) and 22 deletions(-)
  1. ml-agents/mlagents/trainers/common/nn_policy.py (9 changes)
  2. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (4 changes)
  3. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (4 changes)
  4. ml-agents/mlagents/trainers/ppo/optimizer.py (6 changes)
  5. ml-agents/mlagents/trainers/sac/network.py (8 changes)
  6. ml-agents/mlagents/trainers/sac/optimizer.py (7 changes)
  7. ml-agents/mlagents/trainers/tf_policy.py (1 change)
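The removal works because TensorFlow 1.x sessions let feed_dict override any feedable tensor in the graph, not only tf.placeholder nodes, so trainers can feed recorded actions straight into the policy's output tensor. A minimal standalone sketch of that mechanism (the logits/action names here are illustrative, not from the repo):

```python
import numpy as np
import tensorflow as tf  # TF 1.x graph-mode API, as used by ml-agents at this point

# Illustrative stand-ins for the policy's computed output.
logits = tf.constant([[0.1, 0.9]], dtype=tf.float32)
action = tf.argmax(logits, axis=1, name="sampled_action")  # not a placeholder
one_hot = tf.one_hot(action, depth=2)

with tf.Session() as sess:
    # Inference: the action is computed from the logits.
    print(sess.run(one_hot))  # [[0., 1.]]
    # Training-style override: feed a recorded action into the non-placeholder
    # tensor; downstream ops use the fed value instead of the computed one.
    print(sess.run(one_hot, feed_dict={action: np.array([0])}))  # [[1., 0.]]
```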

ml-agents/mlagents/trainers/common/nn_policy.py (9 changes)


     (tf.identity(self.all_log_probs)), axis=1, keepdims=True
 )
-self.action_holder = tf.placeholder(
-    shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
-)

 def _create_dc_actor(self, encoded: tf.Tensor) -> None:
     """
     Creates Discrete control actor-critic model.

 self.output = tf.identity(output)
 self.all_log_probs = tf.identity(normalized_logits, name="action")
-self.action_holder = tf.placeholder(
-    shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
-)

-    tf.one_hot(self.action_holder[:, i], self.act_size[i])
+    tf.one_hot(self.output[:, i], self.act_size[i])
     for i in range(len(self.act_size))
 ],
 axis=1,
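For orientation, a simplified sketch of what the discrete branch does after this change: the per-branch one-hot encoding of selected actions is built from the policy's own output tensor (which trainers can still override via feed_dict), so no separate action_holder placeholder is needed. The helper name is illustrative; the tensor shapes follow the diff above:

```python
import tensorflow as tf  # TF 1.x API

def one_hot_selected_actions(output, act_size):
    """Concatenate one-hot encodings of each discrete action branch.
    `output` is an int32 tensor of shape [batch, num_branches]; `act_size`
    lists the number of choices per branch, as in the diff above."""
    return tf.concat(
        [tf.one_hot(output[:, i], act_size[i]) for i in range(len(act_size))],
        axis=1,
    )
```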

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (4 changes)


 if self.policy.use_continuous_act:
     feed_dict[self.policy.selected_actions] = mini_batch["actions"]
 else:
-    feed_dict[self.policy.action_holder] = mini_batch["actions"]
+    feed_dict[self.policy.output] = mini_batch["actions"]
 unscaled_reward = self.policy.sess.run(
     self.model.intrinsic_reward, feed_dict=feed_dict
 )

 if self.policy.use_continuous_act:
     feed_dict[policy.selected_actions] = mini_batch["actions"]
 else:
-    feed_dict[policy.action_holder] = mini_batch["actions"]
+    feed_dict[policy.output] = mini_batch["actions"]
 if self.policy.use_vec_obs:
     feed_dict[policy.vector_in] = mini_batch["vector_obs"]
     feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
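The same pattern, pulled out as a hedged sketch of the curiosity feed construction after this change (the helper name is illustrative; the attribute and batch-key names follow the diff):

```python
def action_feed(policy, mini_batch):
    """Choose the tensor that receives recorded actions for the curiosity
    module: continuous policies still use `selected_actions`, while discrete
    policies now feed `policy.output` instead of the removed action_holder."""
    feed_dict = {}
    if policy.use_continuous_act:
        feed_dict[policy.selected_actions] = mini_batch["actions"]
    else:
        feed_dict[policy.output] = mini_batch["actions"]
    return feed_dict
```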

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (4 changes)


 if self.policy.use_continuous_act:
     feed_dict[self.policy.selected_actions] = mini_batch["actions"]
 else:
-    feed_dict[self.policy.action_holder] = mini_batch["actions"]
+    feed_dict[self.policy.output] = mini_batch["actions"]
 feed_dict[self.model.done_policy_holder] = np.array(
     mini_batch["done"]
 ).flatten()

 if self.policy.use_continuous_act:
     feed_dict[policy.selected_actions] = mini_batch["actions"]
 else:
-    feed_dict[policy.action_holder] = mini_batch["actions"]
+    feed_dict[policy.output] = mini_batch["actions"]
 if self.policy.use_vis_obs > 0:
     for i in range(len(policy.visual_in)):
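A sketch of the GAIL-side feed after this change; only the helper name is new, the keys and attribute names mirror the diff:

```python
import numpy as np

def gail_policy_feed(policy, model, mini_batch):
    """Feed the policy's recorded actions (via `selected_actions` or
    `policy.output`) plus the done flags used by the discriminator."""
    feed_dict = {}
    if policy.use_continuous_act:
        feed_dict[policy.selected_actions] = mini_batch["actions"]
    else:
        feed_dict[policy.output] = mini_batch["actions"]
    feed_dict[model.done_policy_holder] = np.array(mini_batch["done"]).flatten()
    return feed_dict
```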

ml-agents/mlagents/trainers/ppo/optimizer.py (6 changes)


     self, h_size: int, num_layers: int, vis_encode_type: EncoderType
 ) -> None:
     """
-    Creates Continuous control actor-critic model.
+    Creates Continuous control critic (value) network.
     :param h_size: Size of hidden linear layers.
     :param num_layers: Number of hidden linear layers.
     :param vis_encode_type: The type of visual encoder to use.

     self, h_size: int, num_layers: int, vis_encode_type: EncoderType
 ) -> None:
     """
-    Creates Discrete control actor-critic model.
+    Creates Discrete control critic (value) network.
     :param h_size: Size of hidden linear layers.
     :param num_layers: Number of hidden linear layers.
     :param vis_encode_type: The type of visual encoder to use.

 if self.policy.output_pre is not None and "actions_pre" in mini_batch:
     feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
 else:
-    feed_dict[self.policy.action_holder] = mini_batch["actions"]
+    feed_dict[self.policy.output] = mini_batch["actions"]
 if self.policy.use_recurrent:
     feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
 feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
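A sketch of where PPO's recorded actions now go (helper name illustrative; attributes and batch keys follow the diff): pre-clip continuous actions still go to `output_pre` when the batch recorded them, and everything else is fed directly into `policy.output`:

```python
def ppo_action_feed(policy, mini_batch, feed_dict):
    """Route recorded actions for the PPO update after this change."""
    if policy.output_pre is not None and "actions_pre" in mini_batch:
        feed_dict[policy.output_pre] = mini_batch["actions_pre"]
    else:
        feed_dict[policy.output] = mini_batch["actions"]
    return feed_dict
```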

ml-agents/mlagents/trainers/sac/network.py (8 changes)


     self.h_size,
     self.join_scopes(scope, "value"),
 )
+self.external_action_in = tf.placeholder(
+    shape=[None, self.policy.act_size[0]],
+    dtype=tf.float32,
+    name="external_action_in",
+)
-hidden_q = tf.concat([hidden_value, self.policy.action_holder], axis=-1)
+hidden_q = tf.concat([hidden_value, self.external_action_in], axis=-1)
 hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)
 self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
     self.stream_names,
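A sketch of the SAC Q-input wiring after this change: the Q-network owns its own `external_action_in` placeholder for replay-buffer actions instead of borrowing the policy's (now removed) action_holder. Function and argument names are illustrative; the placeholder and concat calls mirror the diff:

```python
import tensorflow as tf  # TF 1.x API

def q_inputs(hidden_value, policy_output, continuous_act_size):
    """Build Q(s, a) inputs for buffer actions and Q(s, pi(s)) for policy actions."""
    external_action_in = tf.placeholder(
        shape=[None, continuous_act_size],
        dtype=tf.float32,
        name="external_action_in",
    )
    hidden_q = tf.concat([hidden_value, external_action_in], axis=-1)  # buffer actions
    hidden_qp = tf.concat([hidden_value, policy_output], axis=-1)      # on-policy actions
    return external_action_in, hidden_q, hidden_qp
```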

ml-agents/mlagents/trainers/sac/optimizer.py (7 changes)


 self.visual_in = self.policy.visual_in
 self.next_vector_in = self.target_network.vector_in
 self.next_visual_in = self.target_network.visual_in
-self.action_holder = self.policy.action_holder
 self.sequence_length_ph = self.policy.sequence_length_ph
 self.next_sequence_length_ph = self.target_network.sequence_length_ph
 if not self.policy.use_continuous_act:

-# Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
+# Don't use value estimate during inference.
 self.value = tf.identity(
     self.policy_network.value, name="value_estimate_unused"
 )

 feed_dict[self.rewards_holders[name]] = batch["{}_rewards".format(name)]
 if self.policy.use_continuous_act:
-    feed_dict[policy.action_holder] = batch["actions"]
+    feed_dict[self.policy_network.external_action_in] = batch["actions"]
 else:
-    feed_dict[policy.action_holder] = batch["actions"]
+    feed_dict[policy.output] = batch["actions"]
 if self.policy.use_recurrent:
     feed_dict[policy.prev_action] = batch["prev_action"]
 feed_dict[policy.action_masks] = batch["action_mask"]
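And the matching feed on the SAC optimizer side, as a sketch (helper name illustrative; attributes and keys follow the diff): continuous replay-buffer actions go into the Q-network's `external_action_in`, discrete ones into `policy.output`:

```python
def sac_action_feed(policy, policy_network, batch, feed_dict):
    """Route replay-buffer actions for the SAC update after this change."""
    if policy.use_continuous_act:
        feed_dict[policy_network.external_action_in] = batch["actions"]
    else:
        feed_dict[policy.output] = batch["actions"]
    return feed_dict
```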

ml-agents/mlagents/trainers/tf_policy.py (1 change)


 self.output_pre: Optional[tf.Tensor] = None
 self.output: Optional[tf.Tensor] = None
 self.selected_actions: Optional[tf.Tensor] = None
-self.action_holder: Optional[tf.Tensor] = None
 self.action_masks: Optional[tf.Tensor] = None
 self.prev_action: Optional[tf.Tensor] = None
 self.memory_in: Optional[tf.Tensor] = None
