
Switch to tanh squash in PPO

/develop/tanhsquash
Ervin Teng, 5 years ago
Current commit: 3a4fa244
7 changed files with 15 additions and 17 deletions
  1. ml-agents/mlagents/trainers/ppo/models.py (20 changed lines)
  2. ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (2 changed lines)
  3. ml-agents/mlagents/trainers/ppo/policy.py (3 changed lines)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (2 changed lines)
  5. ml-agents/mlagents/trainers/sac/policy.py (2 changed lines)
  6. ml-agents/mlagents/trainers/tests/mock_brain.py (1 changed line)
  7. ml-agents/mlagents/trainers/tests/test_sac.py (2 changed lines)

ml-agents/mlagents/trainers/ppo/models.py (20 changed lines)


  logger = logging.getLogger("mlagents.trainers")
  EPSILON = 1e-6  # Small value to avoid divide by zero
  class PPOModel(LearningModel):
      def __init__(
  ...
          self.epsilon = tf.placeholder(
              shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
          )
-         # Clip and scale output to ensure actions are always within [-1, 1] range.
-         self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
-         output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
-         self.output = tf.identity(output_post, name="action")
-         self.selected_actions = tf.stop_gradient(output_post)
+         _policy_out = mu + tf.sqrt(sigma_sq) * self.epsilon
+         # Squash using tanh to ensure actions are always within [-1, 1] range.
+         self.output_pre = tf.tanh(_policy_out)
+         self.output = tf.identity(self.output_pre, name="action")
+         self.selected_actions = tf.stop_gradient(self.output)
  ...
-             -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
+             -0.5 * tf.square(tf.stop_gradient(_policy_out) - mu) / sigma_sq
          )
+         # Correct for tanh squash (source: https://arxiv.org/abs/1801.01290)
+         all_probs -= tf.reduce_sum(
+             tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
+         )
          self.all_log_probs = tf.identity(all_probs, name="action_probs")
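
The substance of the models.py change: the clip-and-scale squash is replaced by a tanh squash, and the Gaussian log-probability is corrected for the change of variables a = tanh(u), as in Haarnoja et al. (https://arxiv.org/abs/1801.01290): log pi(a) = log N(u; mu, sigma_sq) - sum_i log(1 - tanh(u_i)^2). The following is a minimal NumPy sketch of that correction, for illustration only; the helper names gaussian_log_prob and squashed_sample_and_log_prob are invented here and are not ml-agents code.

# Minimal NumPy sketch of the tanh-squash log-prob correction
# (illustrative only; helper names are hypothetical, not from ml-agents).
import numpy as np

EPSILON = 1e-6  # small value to keep the log argument away from zero

def gaussian_log_prob(x, mu, sigma_sq):
    # Elementwise log-density of a diagonal Gaussian N(mu, sigma_sq) at x.
    return -0.5 * (x - mu) ** 2 / sigma_sq - 0.5 * np.log(2.0 * np.pi * sigma_sq)

def squashed_sample_and_log_prob(mu, sigma_sq, epsilon):
    # Reparameterized pre-squash sample, matching `_policy_out` above.
    u = mu + np.sqrt(sigma_sq) * epsilon
    # Squash into [-1, 1].
    a = np.tanh(u)
    # Log-prob of u under the Gaussian, summed over action dimensions.
    log_prob = gaussian_log_prob(u, mu, sigma_sq).sum(axis=-1, keepdims=True)
    # Change-of-variables correction: subtract sum_i log(1 - tanh(u_i)^2).
    log_prob -= np.log(1.0 - a ** 2 + EPSILON).sum(axis=-1, keepdims=True)
    return a, log_prob

# Example: a batch of 4 samples over a 2-dimensional action space.
mu = np.zeros((4, 2))
sigma_sq = np.ones((4, 2))
eps = np.random.randn(4, 2)
actions, log_probs = squashed_sample_and_log_prob(mu, sigma_sq, eps)

The EPSILON term plays the same role as in the graph code above: it keeps the logarithm finite when the squashed action saturates near -1 or 1.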

ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (2 changed lines)


"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out
if (

ml-agents/mlagents/trainers/ppo/policy.py (3 changed lines)


"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out

]
if self.use_continuous_act:
feed_dict[model.output_pre] = mini_batch["actions_pre"]
feed_dict[model.epsilon] = mini_batch["random_normal_epsilon"]
else:
feed_dict[model.action_holder] = mini_batch["actions"]

ml-agents/mlagents/trainers/ppo/trainer.py (2 changed lines)


"""
actions = take_action_outputs["action"]
if self.policy.use_continuous_act:
actions_pre = take_action_outputs["pre_action"]
self.training_buffer[agent_id]["actions_pre"].append(actions_pre[agent_idx])
epsilons = take_action_outputs["random_normal_epsilon"]
self.training_buffer[agent_id]["random_normal_epsilon"].append(
epsilons[agent_idx]
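
Together with the policy.py feed_dict change above, this hunk drops the stored pre-squash action from the trainer's bookkeeping: because the graph now rebuilds _policy_out = mu + sqrt(sigma_sq) * epsilon and squashes it with tanh, only the sampled epsilon has to be kept in the training buffer. A rough sketch of that bookkeeping, with purely illustrative names (not ml-agents APIs):

import numpy as np

# Simplified stand-in for the per-agent training buffer.
training_buffer = {"random_normal_epsilon": []}

def on_step(mu, sigma_sq, epsilon):
    # Collection time: store only epsilon; the env receives the squashed action.
    training_buffer["random_normal_epsilon"].append(epsilon)
    return np.tanh(mu + np.sqrt(sigma_sq) * epsilon)

def rebuild_pre_squash(mu, sigma_sq, idx):
    # Update time: given the network's current mu and sigma_sq, the same
    # reparameterization reproduces the pre-squash value, so a separate
    # "actions_pre" entry is redundant.
    eps = training_buffer["random_normal_epsilon"][idx]
    return mu + np.sqrt(sigma_sq) * eps

# Example usage with a 2-dimensional action:
mu, sigma_sq = np.zeros(2), np.ones(2)
action = on_step(mu, sigma_sq, np.random.randn(2))
pre_squash = rebuild_pre_squash(mu, sigma_sq, idx=0)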

ml-agents/mlagents/trainers/sac/policy.py (2 changed lines)


"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out

ml-agents/mlagents/trainers/tests/mock_brain.py (1 changed line)


          )
          else:
              buffer[0]["action_probs"].append(np.ones(buffer[0]["actions"][0].shape))
-             buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape))
              buffer[0]["random_normal_epsilon"].append(
                  np.ones(buffer[0]["actions"][0].shape)
              )

ml-agents/mlagents/trainers/tests/test_sac.py (2 changed lines)


          env,
          policy,
          BUFFER_INIT_SAMPLES,
-         exclude_key_list=["advantages", "actions_pre", "random_normal_epsilon"],
+         exclude_key_list=["advantages", "random_normal_epsilon"],
      )
      # Mock out reward signal eval
