
Fix SAC CC and some reward signal tests

/develop/nopreviousactions
Ervin Teng, 5 years ago
commit cadf6603
7 changed files with 77 additions and 67 deletions
  1. ml-agents/mlagents/trainers/common/nn_policy.py (6 changed lines)
  2. ml-agents/mlagents/trainers/components/bc/model.py (4 changed lines)
  3. ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (49 changed lines)
  4. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (2 changed lines)
  5. ml-agents/mlagents/trainers/sac/optimizer.py (19 changed lines)
  6. ml-agents/mlagents/trainers/tests/test_reward_signals.py (62 changed lines)
  7. ml-agents/mlagents/trainers/tf_policy.py (2 changed lines)

ml-agents/mlagents/trainers/common/nn_policy.py (6 changed lines)


```diff
 :param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
 :param resample: Whether we are using the resampling trick to update the policy in continuous output.
 """
-with tf.variable_scope("policy"):
+with tf.variable_scope("policy/"):
 super().__init__(seed, brain, trainer_params, load)
 self.stats_name_to_update_name = {

 h_size,
 num_layers,
 vis_encode_type,
-stream_scopes=["policy"],
+stream_scopes=["policy/"],
 )[0]
 if self.use_recurrent:

 h_size,
 num_layers,
 vis_encode_type,
-stream_scopes=["policy"],
+stream_scopes=["policy/"],
 )[0]
 if self.use_recurrent:
```
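The only substantive change here is the trailing slash on the scope name. A plausible reading (an assumption, not stated in the commit) is that other code selects the policy's variables by scope prefix, where a bare "policy" prefix would also match sibling scopes that merely start with the same string. A minimal TF 1.x sketch of that pitfall, using made-up scope names:

```python
import tensorflow as tf  # TF 1.x API, matching the code in this diff

with tf.variable_scope("policy"):
    w = tf.get_variable("w", shape=[4, 2])
with tf.variable_scope("policy_target"):  # hypothetical sibling scope
    w_target = tf.get_variable("w", shape=[4, 2])

# get_collection filters by regex match on the variable name, so the bare
# prefix "policy" also picks up "policy_target/w"; "policy/" does not.
print([v.name for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")])
# -> ['policy/w:0', 'policy_target/w:0']
print([v.name for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/")])
# -> ['policy/w:0']
```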

ml-agents/mlagents/trainers/components/bc/model.py (4 changed lines)


```diff
 else:
 self.annealed_learning_rate = tf.Variable(learning_rate)
-optimizer = tf.train.AdamOptimizer(learning_rate=self.annealed_learning_rate)
+optimizer = tf.train.AdamOptimizer(
+learning_rate=self.annealed_learning_rate, name="bc_adam"
+)
 self.update_batch = optimizer.minimize(self.loss)
```
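Both this file and sac/optimizer.py give every Adam instance an explicit name. In TF 1.x an optimizer adds its own auxiliary variables (slots and beta-power accumulators) to the graph under its name, so distinct names keep those variables grouped under predictable prefixes instead of auto-uniquified "Adam", "Adam_1", ... A small standalone sketch (not the trainer code itself):

```python
import tensorflow as tf  # TF 1.x API

x = tf.Variable(1.0, name="x")
bc_loss = tf.square(x)            # stand-in for the behavioral-cloning loss
policy_loss = tf.square(x - 2.0)  # stand-in for a second loss in the same graph

# Explicit, unique optimizer names give stable, self-describing names for the
# Adam-created variables, independent of the order the optimizers are built in.
bc_update = tf.train.AdamOptimizer(learning_rate=1e-3, name="bc_adam").minimize(bc_loss)
policy_update = tf.train.AdamOptimizer(learning_rate=1e-3, name="sac_policy_opt").minimize(policy_loss)

print(sorted(v.name for v in tf.global_variables()))
```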

ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py (49 changed lines)


```diff
 class CuriosityModel(object):
 def __init__(
-self,
-policy_model: TFPolicy,
-encoding_size: int = 128,
-learning_rate: float = 3e-4,
+self, policy: TFPolicy, encoding_size: int = 128, learning_rate: float = 3e-4
-:param policy_model: The model being used by the learning policy
+:param policy: The model being used by the learning policy
-self.policy_model = policy_model
+self.policy = policy
 self.next_visual_in: List[tf.Tensor] = []
 encoded_state, encoded_next_state = self.create_curiosity_encoders()
 self.create_inverse_model(encoded_state, encoded_next_state)

 encoded_state_list = []
 encoded_next_state_list = []
-if self.policy_model.vis_obs_size > 0:
+if self.policy.vis_obs_size > 0:
-for i in range(self.policy_model.vis_obs_size):
+for i in range(self.policy.vis_obs_size):
-self.policy_model.brain.camera_resolutions[i],
+self.policy.brain.camera_resolutions[i],
 name="curiosity_next_visual_observation_" + str(i),
 )
 self.next_visual_in.append(next_visual_input)

 encoded_visual = LearningModel.create_visual_observation_encoder(
-self.policy_model.visual_in[i],
+self.policy.visual_in[i],
 self.encoding_size,
 LearningModel.swish,
 1,

 encoded_state_list.append(hidden_visual)
 encoded_next_state_list.append(hidden_next_visual)
-if self.policy_model.vec_obs_size > 0:
+if self.policy.vec_obs_size > 0:
-shape=[None, self.policy_model.vec_obs_size],
+shape=[None, self.policy.vec_obs_size],
-self.policy_model.vector_in,
+self.policy.vector_in,
 self.encoding_size,
 LearningModel.swish,
 2,

 """
 combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
 hidden = tf.layers.dense(combined_input, 256, activation=LearningModel.swish)
-if self.policy_model.brain.vector_action_space_type == "continuous":
+if self.policy.brain.vector_action_space_type == "continuous":
-hidden, self.policy_model.act_size[0], activation=None
+hidden, self.policy.act_size[0], activation=None
-tf.squared_difference(pred_action, self.policy_model.selected_actions),
-axis=1,
+tf.squared_difference(pred_action, self.policy.selected_actions), axis=1
-tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1]
+tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
-hidden, self.policy_model.act_size[i], activation=tf.nn.softmax
+hidden, self.policy.act_size[i], activation=tf.nn.softmax
-for i in range(len(self.policy_model.act_size))
+for i in range(len(self.policy.act_size))
--tf.log(pred_action + 1e-10) * self.policy_model.selected_actions,
-axis=1,
+-tf.log(pred_action + 1e-10) * self.policy.selected_actions, axis=1
-tf.dynamic_partition(cross_entropy, self.policy_model.mask, 2)[1]
+tf.dynamic_partition(cross_entropy, self.policy.mask, 2)[1]
 )
 def create_forward_model(

 :param encoded_next_state: Tensor corresponding to encoded next state.
 """
 combined_input = tf.concat(
-[encoded_state, self.policy_model.selected_actions], axis=1
+[encoded_state, self.policy.selected_actions], axis=1
-* (
-self.policy_model.vis_obs_size + int(self.policy_model.vec_obs_size > 0)
-),
+* (self.policy.vis_obs_size + int(self.policy.vec_obs_size > 0)),
 activation=None,
 )
 squared_difference = 0.5 * tf.reduce_sum(

 self.forward_loss = tf.reduce_mean(
-tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1]
+tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
 )
 def create_loss(self, learning_rate: float) -> None:
```
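Aside from the `policy_model` to `policy` rename, the touched loss lines all share one idiom worth spelling out: `tf.dynamic_partition(per_step_loss, mask, 2)[1]` splits the per-step losses into the `mask == 0` and `mask == 1` partitions and keeps only the active (non-padded) steps before averaging. A self-contained illustration (not ml-agents code):

```python
import tensorflow as tf  # TF 1.x API

per_step_loss = tf.constant([0.5, 1.0, 2.0, 4.0])
mask = tf.constant([1, 0, 1, 1])  # 0 marks padded / inactive steps

# Partition 1 holds the entries where mask == 1; average only those.
active_losses = tf.dynamic_partition(per_step_loss, mask, 2)[1]
loss = tf.reduce_mean(active_losses)

with tf.Session() as sess:
    print(sess.run(loss))  # (0.5 + 2.0 + 4.0) / 3 ≈ 2.167
```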

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (2 changed lines)


"""
feed_dict = {
policy.batch_size_ph: num_sequences,
policy.sequence_length: self.policy.sequence_length,
policy.sequence_length_ph: self.policy.sequence_length,
policy.mask_input: mini_batch["masks"],
}
if self.policy.use_continuous_act:
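The `_ph` suffix distinguishes the graph placeholder from the plain Python value of the same name on the policy, which is exactly what this feed_dict does: it feeds `self.policy.sequence_length` (an int) into `policy.sequence_length_ph` (a placeholder). A trivial sketch of that distinction, with illustrative names:

```python
import tensorflow as tf  # TF 1.x API

sequence_length = 16  # ordinary Python attribute on the policy
sequence_length_ph = tf.placeholder(tf.int32, shape=[], name="sequence_length")  # graph input

doubled = sequence_length_ph * 2
with tf.Session() as sess:
    # The placeholder is the feed_dict key; the Python value is what gets fed.
    print(sess.run(doubled, feed_dict={sequence_length_ph: sequence_length}))  # 32
```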

ml-agents/mlagents/trainers/sac/optimizer.py (19 changed lines)


```diff
 * tf.to_float(self.policy.mask)
 * tf.stop_gradient(
 tf.reduce_sum(
-branched_per_action_ent + self.target_entropy,
+self.policy.all_log_probs + self.target_entropy,
 axis=1,
 keep_dims=True,
 )

-self.ent_coef * per_action_entropy - self.policy_network.q1_p, axis=1
+self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
+axis=1,
 )
 self.policy_loss = tf.reduce_mean(
 tf.to_float(self.policy.mask) * batch_policy_loss

 for name in stream_names:
 v_backup = tf.stop_gradient(
 self.min_policy_qs[name]
-- tf.reduce_sum(self.ent_coef * per_action_entropy, axis=1)
+- tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
 )
 value_losses.append(
 0.5

 Creates the Adam optimizers and update ops for SAC, including
 the policy, value, and entropy updates, as well as the target network update.
 """
-policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
-entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
-value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
+policy_optimizer = tf.train.AdamOptimizer(
+learning_rate=self.learning_rate, name="sac_policy_opt"
+)
+entropy_optimizer = tf.train.AdamOptimizer(
+learning_rate=self.learning_rate, name="sac_entropy_opt"
+)
+value_optimizer = tf.train.AdamOptimizer(
+learning_rate=self.learning_rate, name="sac_value_opt"
+)
 self.target_update_op = [
 tf.assign(target, (1 - self.tau) * target + self.tau * source)
```
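This is the continuous-control ("CC") fix named in the commit title: the entropy, policy, and value terms now all use the log-probability of the sampled action (`self.policy.all_log_probs`) rather than a separate per-action entropy tensor. A hedged sketch of the three entropy-regularized SAC objectives these lines correspond to, using stand-in placeholders rather than the trainer's real tensors:

```python
import tensorflow as tf  # TF 1.x-style sketch; every name below is illustrative

log_probs = tf.placeholder(tf.float32, [None, 1])  # log pi(a|s) of the sampled continuous action
q1_p = tf.placeholder(tf.float32, [None, 1])       # Q1(s, a ~ pi)
min_q = tf.placeholder(tf.float32, [None])         # min_i Q_i(s, a ~ pi)
mask = tf.placeholder(tf.float32, [None])          # 1 = real step, 0 = padding
target_entropy = -2.0                               # e.g. -dim(A) for a 2-D action space

log_ent_coef = tf.Variable(0.0)
ent_coef = tf.exp(log_ent_coef)

# Temperature loss: adjust alpha so E[log pi + target_entropy] is driven toward zero.
entropy_loss = -tf.reduce_mean(
    log_ent_coef * mask * tf.stop_gradient(tf.reduce_sum(log_probs + target_entropy, axis=1))
)

# Policy loss: E[ alpha * log pi(a|s) - Q1(s, a) ], masked over padded steps.
batch_policy_loss = tf.reduce_sum(ent_coef * log_probs - q1_p, axis=1)
policy_loss = tf.reduce_mean(mask * batch_policy_loss)

# Entropy-regularized value target: V(s) <- min_i Q_i(s, a~pi) - alpha * log pi(a|s).
v_backup = tf.stop_gradient(min_q - tf.reduce_sum(ent_coef * log_probs, axis=1))
```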

ml-agents/mlagents/trainers/tests/test_reward_signals.py (62 changed lines)


```diff
 import os
 import mlagents.trainers.tests.mock_brain as mb
 from mlagents.trainers.common.nn_policy import NNPolicy
-from mlagents.trainers.sac.policy import SACPolicy
+from mlagents.trainers.sac.optimizer import SACOptimizer
+from mlagents.trainers.ppo.optimizer import PPOOptimizer
 def ppo_dummy_config():

 NUM_AGENTS = 12
-def create_policy_mock(
+def create_optimizer_mock(
 trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
 ):
 mock_brain = mb.setup_mock_brain(

 trainer_parameters["keep_checkpoints"] = 3
 trainer_parameters["reward_signals"].update(reward_signal_config)
 trainer_parameters["use_recurrent"] = use_rnn
-if trainer_config["trainer"] == "ppo":
-policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
+policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
+if trainer_parameters["trainer"] == "sac":
+optimizer = SACOptimizer(policy, trainer_parameters)
-policy = SACPolicy(0, mock_brain, trainer_parameters, False, False)
-return policy
+optimizer = PPOOptimizer(policy, trainer_parameters)
+return optimizer
-def reward_signal_eval(policy, reward_signal_name):
-buffer = mb.simulate_rollout(BATCH_SIZE, policy.brain)
+def reward_signal_eval(optimizer, reward_signal_name):
+buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.brain)
-rsig_result = policy.reward_signals[reward_signal_name].evaluate_batch(buffer)
+rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
-def reward_signal_update(policy, reward_signal_name):
-buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
-feed_dict = policy.reward_signals[reward_signal_name].prepare_update(
-policy, buffer.make_mini_batch(0, 10), 2
+def reward_signal_update(optimizer, reward_signal_name):
+buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
+feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
+optimizer.policy, buffer.make_mini_batch(0, 10), 2
-out = policy._execute_model(
-feed_dict, policy.reward_signals[reward_signal_name].update_dict
+out = optimizer.policy._execute_model(
+feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
 )
 assert type(out) is dict

 )
 def test_gail_cc(trainer_config, gail_dummy_config):
-policy = create_policy_mock(trainer_config, gail_dummy_config, False, False, False)
-reward_signal_eval(policy, "gail")
-reward_signal_update(policy, "gail")
+optimizer = create_optimizer_mock(
+trainer_config, gail_dummy_config, False, False, False
+)
+reward_signal_eval(optimizer, "gail")
+reward_signal_update(optimizer, "gail")
 @pytest.mark.parametrize(

 gail_dummy_config["gail"]["demo_path"] = (
 os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"
 )
-policy = create_policy_mock(trainer_config, gail_dummy_config, False, True, True)
-reward_signal_eval(policy, "gail")
-reward_signal_update(policy, "gail")
+optimizer = create_optimizer_mock(
+trainer_config, gail_dummy_config, False, True, True
+)
+reward_signal_eval(optimizer, "gail")
+reward_signal_update(optimizer, "gail")
 @pytest.mark.parametrize(

-policy = create_policy_mock(trainer_config, gail_dummy_config, True, False, False)
+policy = create_optimizer_mock(
+trainer_config, gail_dummy_config, True, False, False
+)
 reward_signal_eval(policy, "gail")
 reward_signal_update(policy, "gail")

 )
 def test_curiosity_cc(trainer_config, curiosity_dummy_config):
-policy = create_policy_mock(
+policy = create_optimizer_mock(
 trainer_config, curiosity_dummy_config, False, False, False
 )
 reward_signal_eval(policy, "curiosity")

 "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_curiosity_dc(trainer_config, curiosity_dummy_config):
-policy = create_policy_mock(
+policy = create_optimizer_mock(
 trainer_config, curiosity_dummy_config, False, True, False
 )
 reward_signal_eval(policy, "curiosity")

 "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_curiosity_visual(trainer_config, curiosity_dummy_config):
-policy = create_policy_mock(
+policy = create_optimizer_mock(
 trainer_config, curiosity_dummy_config, False, False, True
 )
 reward_signal_eval(policy, "curiosity")

 "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_curiosity_rnn(trainer_config, curiosity_dummy_config):
-policy = create_policy_mock(
+policy = create_optimizer_mock(
 trainer_config, curiosity_dummy_config, True, False, False
 )
 reward_signal_eval(policy, "curiosity")

 "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
 )
 def test_extrinsic(trainer_config, curiosity_dummy_config):
-policy = create_policy_mock(
+policy = create_optimizer_mock(
 trainer_config, curiosity_dummy_config, False, False, False
 )
 reward_signal_eval(policy, "extrinsic")
```
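The net effect of the test changes is that reward signals are now owned by the optimizer rather than the policy, so each test builds an optimizer and passes it to the eval/update helpers. A representative test after the change, written against the helpers and fixtures defined in this file (the body is illustrative, not copied from the commit):

```python
import pytest


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_extrinsic_example(trainer_config, curiosity_dummy_config):
    # Build the optimizer (PPO or SAC) around an NNPolicy, then exercise the
    # extrinsic reward signal through the optimizer.
    optimizer = create_optimizer_mock(
        trainer_config, curiosity_dummy_config, False, False, False
    )
    reward_signal_eval(optimizer, "extrinsic")
    reward_signal_update(optimizer, "extrinsic")
```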

ml-agents/mlagents/trainers/tf_policy.py (2 changed lines)


```diff
 self.running_variance: Optional[tf.Variable] = None
 self.update_normalization_op: Optional[tf.Operation] = None
 self.value: Optional[tf.Tensor] = None
-self.all_log_probs: Optional[tf.Tensor] = None
+self.all_log_probs: tf.Tensor = None
 self.log_probs: Optional[tf.Tensor] = None
 self.entropy: Optional[tf.Tensor] = None
 self.action_oh: tf.Tensor = None
```
