
Fix BC and tests

/develop/nopreviousactions
Ervin Teng, 4 years ago
Current commit
cfc2f455
7 files changed, with 86 insertions and 121 deletions
  1. ml-agents/mlagents/trainers/common/nn_policy.py (18 changes)
  2. ml-agents/mlagents/trainers/components/bc/model.py (30 changes)
  3. ml-agents/mlagents/trainers/components/bc/module.py (4 changes)
  4. ml-agents/mlagents/trainers/optimizer.py (14 changes)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (4 changes)
  6. ml-agents/mlagents/trainers/sac/trainer.py (5 changes)
  7. ml-agents/mlagents/trainers/tests/test_bcmodule.py (132 changes)

ml-agents/mlagents/trainers/common/nn_policy.py (18 changes)


 from mlagents.trainers.models import EncoderType
 from mlagents.trainers.models import LearningModel
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.components.bc.module import BCModule
 logger = logging.getLogger("mlagents.trainers")

             )
         else:
             self.create_dc_actor(h_size, num_layers, vis_encode_type)
-        self.bc_module: Optional[BCModule] = None
-        # Create pretrainer if needed
-        if "behavioral_cloning" in trainer_params:
-            BCModule.check_config(trainer_params["behavioral_cloning"])
-            self.bc_module = BCModule(
-                self,
-                policy_learning_rate=trainer_params["learning_rate"],
-                default_batch_size=trainer_params["batch_size"],
-                default_num_epoch=3,
-                **trainer_params["behavioral_cloning"],
-            )
         self.inference_dict: Dict[str, tf.Tensor] = {
             "action": self.output,

         # Stop gradient if we're not doing the resampling trick
         if not resample:
-            sampled_policy = tf.stop_gradient(sampled_policy)
+            sampled_policy_probs = tf.stop_gradient(sampled_policy)
+        else:
+            sampled_policy_probs = sampled_policy

-                ((sampled_policy - mu) / (sigma + EPSILON)) ** 2
+                ((sampled_policy_probs - mu) / (sigma + EPSILON)) ** 2
                 + 2 * log_sigma
                 + np.log(2 * np.pi)
             )
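Note on the hunk above: the renamed expression is part of the diagonal-Gaussian log-density the continuous-action policy uses for its action probabilities. Introducing sampled_policy_probs lets the probability term use a stop-gradient copy when the resampling trick is off, while the sampled action itself keeps its gradient path. For reference, the fragment shown matches the bracketed part of the standard log-density (the -1/2 factor and the sum over action dimensions presumably sit outside the lines shown):

\log \mathcal{N}(a \mid \mu, \sigma) = -\tfrac{1}{2}\left[\left(\tfrac{a - \mu}{\sigma + \epsilon}\right)^{2} + 2\log\sigma + \log 2\pi\right]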

ml-agents/mlagents/trainers/components/bc/model.py (30 changes)


 class BCModel(object):
     def __init__(
-        self, policy_model: TFPolicy, learning_rate: float = 3e-4, anneal_steps: int = 0
+        self, policy: TFPolicy, learning_rate: float = 3e-4, anneal_steps: int = 0

-        :param policy_model: The policy of the learning algorithm
+        :param policy: The policy of the learning algorithm

-        self.policy_model = policy_model
-        self.expert_visual_in = self.policy_model.visual_in
-        self.obs_in_expert = self.policy_model.vector_in
+        self.policy = policy
+        self.expert_visual_in = self.policy.visual_in
+        self.obs_in_expert = self.policy.vector_in
         self.make_inputs()
         self.create_loss(learning_rate, anneal_steps)

         self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
         self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)
-        if self.policy_model.brain.vector_action_space_type == "continuous":
-            action_length = self.policy_model.act_size[0]
+        if self.policy.brain.vector_action_space_type == "continuous":
+            action_length = self.policy.act_size[0]

-            action_length = len(self.policy_model.act_size)
+            action_length = len(self.policy.act_size)
             self.action_in_expert = tf.placeholder(
                 shape=[None, action_length], dtype=tf.int32
             )

-                    for i, act_size in enumerate(self.policy_model.act_size)
+                    for i, act_size in enumerate(self.policy.act_size)
                 ],
                 axis=1,
             )

         :param learning_rate: The learning rate for the optimizer
         :param anneal_steps: Number of steps over which to anneal the learning_rate
         """
-        selected_action = self.policy_model.output
-        if self.policy_model.brain.vector_action_space_type == "continuous":
+        selected_action = self.policy.output
+        if self.policy.use_continuous_act:

-            log_probs = self.policy_model.all_log_probs
+            log_probs = self.policy.all_log_probs
             self.loss = tf.reduce_mean(
                 -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action
             )

-                learning_rate,
-                self.policy_model.global_step,
-                anneal_steps,
-                0.0,
-                power=1.0,
+                learning_rate, self.policy.global_step, anneal_steps, 0.0, power=1.0
             )
         else:
             self.annealed_learning_rate = tf.Variable(learning_rate)
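The last change in this file only collapses the anneal's arguments onto one line. As a rough sketch of what those arguments mean (a hand-rolled equivalent, not the call used in the file, whose name is not visible in the hunk): a polynomial decay toward an end value of 0.0 with power=1.0 is a linear ramp of the BC learning rate from learning_rate down to zero over anneal_steps.

# Hypothetical stand-in for the anneal whose arguments appear above:
# (learning_rate, global_step, anneal_steps, 0.0, power=1.0).
def annealed_lr(learning_rate, global_step, anneal_steps, end_lr=0.0, power=1.0):
    if anneal_steps <= 0:
        return learning_rate  # mirrors the constant-lr else branch above
    frac = min(global_step, anneal_steps) / anneal_steps
    return (learning_rate - end_lr) * (1.0 - frac) ** power + end_lr

# Halfway through annealing, a 3e-4 rate has decayed linearly to 1.5e-4.
assert abs(annealed_lr(3e-4, 5000, 10000) - 1.5e-4) < 1e-12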

ml-agents/mlagents/trainers/components/bc/module.py (4 changes)


"""
feed_dict = {
self.policy.batch_size_ph: n_sequences,
self.policy.sequence_length: self.policy.sequence_length,
self.policy.sequence_length_ph: self.policy.sequence_length,
}
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
if not self.policy.use_continuous_act:

feed_dict[self.policy.memory_in] = np.zeros(
[self.n_sequences, self.policy.m_size], dtype=np.float32
)
if not self.policy.brain.vector_action_space_type == "continuous":
if not self.policy.use_continuous_act:
feed_dict[self.policy.prev_action] = mini_batch_demo["prev_action"]
network_out = self.policy.sess.run(
list(self.out_dict.values()), feed_dict=feed_dict
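The feed_dict fix in this hunk is easy to miss: the old entry used the same attribute as both key and value, so the sequence-length placeholder was never fed correctly; after the change, the placeholder (sequence_length_ph) is the key and the configured integer (sequence_length) is the value. A minimal graph-mode sketch of the intended pattern, written against the plain TensorFlow 1.x API purely for illustration (the project accesses TF through mlagents.tf_utils):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

sequence_length = 16  # configured value: a plain Python int
sequence_length_ph = tf.placeholder(shape=[], dtype=tf.int32, name="sequence_length")
doubled = sequence_length_ph * 2

with tf.Session() as sess:
    # The placeholder tensor is the key; the int is the value fed into it.
    print(sess.run(doubled, feed_dict={sequence_length_ph: sequence_length}))  # 32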

ml-agents/mlagents/trainers/optimizer.py (14 changes)


 import abc
-from typing import Dict, Any, List, Tuple
+from typing import Dict, Any, List, Tuple, Optional
 import numpy as np
 from mlagents.tf_utils.tf import tf

 from mlagents.trainers.components.reward_signals.reward_signal_factory import (
     create_reward_signal,
 )
+from mlagents.trainers.components.bc.module import BCModule
 class Optimizer(abc.ABC):

         self.memory_in: tf.Tensor = None
         self.memory_out: tf.Tensor = None
         self.m_size: int = 0
+        self.bc_module: Optional[BCModule] = None
+        # Create pretrainer if needed
+        if "behavioral_cloning" in trainer_params:
+            BCModule.check_config(trainer_params["behavioral_cloning"])
+            self.bc_module = BCModule(
+                self.policy,
+                policy_learning_rate=trainer_params["learning_rate"],
+                default_batch_size=trainer_params["batch_size"],
+                default_num_epoch=3,
+                **trainer_params["behavioral_cloning"],
+            )
     def get_trajectory_value_estimates(
         self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
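For context, the optimizer now owns the BC module and gates its creation on the presence of a "behavioral_cloning" section in the trainer configuration. A hedged sketch of the relevant slice of trainer_params, with keys taken from the dummy configs in the tests below and values shown only as examples:

trainer_params = {
    "learning_rate": 3.0e-4,
    "batch_size": 128,
    "behavioral_cloning": {
        "demo_path": "./Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo",
        "strength": 1.0,
        "steps": 10000000,
    },
}

# Only configs that carry this section get a BCModule attached to the optimizer.
assert "behavioral_cloning" in trainer_params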

ml-agents/mlagents/trainers/ppo/trainer.py (4 changes)


         for stat, stat_list in batch_update_stats.items():
             self.stats_reporter.add_stat(stat, np.mean(stat_list))
-        if self.policy.bc_module:
-            update_stats = self.policy.bc_module.update()
+        if self.optimizer.bc_module:
+            update_stats = self.optimizer.bc_module.update()
             for stat, val in update_stats.items():
                 self.stats_reporter.add_stat(stat, val)
         self.clear_update_buffer()

ml-agents/mlagents/trainers/sac/trainer.py (5 changes)


         for stat, stat_list in batch_update_stats.items():
             self.stats_reporter.add_stat(stat, np.mean(stat_list))
-        bc_module = self.policy.bc_module
-        if bc_module:
-            update_stats = bc_module.update()
+        if self.optimizer.bc_module:
+            update_stats = self.optimizer.bc_module.update()
             for stat, val in update_stats.items():
                 self.stats_reporter.add_stat(stat, val)

ml-agents/mlagents/trainers/tests/test_bcmodule.py (132 changes)


 import os
 from mlagents.trainers.common.nn_policy import NNPolicy
-from mlagents.trainers.sac.policy import SACPolicy
+from mlagents.trainers.components.bc.module import BCModule
 def ppo_dummy_config():

     )

-def sac_dummy_config():
-    return yaml.safe_load(
-        """
-        trainer: sac
-        batch_size: 128
-        buffer_size: 50000
-        buffer_init_steps: 0
-        hidden_units: 128
-        init_entcoef: 1.0
-        learning_rate: 3.0e-4
-        max_steps: 5.0e4
-        memory_size: 256
-        normalize: false
-        num_update: 1
-        train_interval: 1
-        num_layers: 2
-        time_horizon: 64
-        sequence_length: 64
-        summary_freq: 1000
-        tau: 0.005
-        use_recurrent: false
-        vis_encode_type: simple
-        behavioral_cloning:
-            demo_path: ./Project/Assets/ML-Agents/Examples/Pyramids/Demos/ExpertPyramid.demo
-            strength: 1.0
-            steps: 10000000
-        reward_signals:
-            extrinsic:
-                strength: 1.0
-                gamma: 0.99
-        """
-    )
-def create_policy_with_bc_mock(mock_brain, trainer_config, use_rnn, demo_file):
+def create_bc_module(mock_brain, trainer_config, use_rnn, demo_file, tanhresample):
     # model_path = env.external_brain_names[0]
     trainer_config["model_path"] = "testpath"
     trainer_config["keep_checkpoints"] = 3

     )
-    policy = (
-        NNPolicy(0, mock_brain, trainer_config, False, False)
-        if trainer_config["trainer"] == "ppo"
-        else SACPolicy(0, mock_brain, trainer_config, False, False)
-    )
-    return policy
+    policy = NNPolicy(
+        0, mock_brain, trainer_config, False, False, tanhresample, tanhresample
+    )
+    with policy.graph.as_default():
+        bc_module = BCModule(
+            policy,
+            policy_learning_rate=trainer_config["learning_rate"],
+            default_batch_size=trainer_config["batch_size"],
+            default_num_epoch=3,
+            **trainer_config["behavioral_cloning"],
+        )
+    policy.initialize_or_load()
+    return bc_module
 # Test default values

     trainer_config = ppo_dummy_config()
-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    assert policy.bc_module.num_epoch == 3
-    assert policy.bc_module.batch_size == trainer_config["batch_size"]
+    bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
+    assert bc_module.num_epoch == 3
+    assert bc_module.batch_size == trainer_config["batch_size"]

-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    assert policy.bc_module.num_epoch == 100
-    assert policy.bc_module.batch_size == 10000
+    bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", False)
+    assert bc_module.num_epoch == 100
+    assert bc_module.batch_size == 10000
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_update(is_sac):

-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), False, "test.demo", is_sac
+    )
+    stats = bc_module.update()
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_constant_lr_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_constant_lr_update(is_sac):
+    trainer_config = ppo_dummy_config()

-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, False, "test.demo")
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(mock_brain, trainer_config, False, "test.demo", is_sac)
+    stats = bc_module.update()

-    old_learning_rate = policy.bc_module.current_lr
+    old_learning_rate = bc_module.current_lr

-    stats = policy.bc_module.update()
-    assert old_learning_rate == policy.bc_module.current_lr
+    stats = bc_module.update()
+    assert old_learning_rate == bc_module.current_lr
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_rnn_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_rnn_update(is_sac):

-    policy = create_policy_with_bc_mock(mock_brain, trainer_config, True, "test.demo")
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), True, "test.demo", is_sac
+    )
+    stats = bc_module.update()
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_dc_visual_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_dc_visual_update(is_sac):

-    policy = create_policy_with_bc_mock(
-        mock_brain, trainer_config, False, "testdcvis.demo"
-    )
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), False, "testdcvis.demo", is_sac
+    )
+    stats = bc_module.update()
-@pytest.mark.parametrize(
-    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_bcmodule_rnn_dc_update(trainer_config):
+@pytest.mark.parametrize("is_sac", [True, False], ids=["ppo", "sac"])
+def test_bcmodule_rnn_dc_update(is_sac):

-    policy = create_policy_with_bc_mock(
-        mock_brain, trainer_config, True, "testdcvis.demo"
-    )
-    stats = policy.bc_module.update()
+    bc_module = create_bc_module(
+        mock_brain, ppo_dummy_config(), True, "testdcvis.demo", is_sac
+    )
+    stats = bc_module.update()
     for _, item in stats.items():
         assert isinstance(item, np.float32)
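The reworked helper returns the BCModule directly instead of a policy, with tanhresample passed twice to NNPolicy (presumably covering both the tanh-squash and resampling flags), so a single policy class stands in for both the PPO and SAC cases. A hedged usage sketch outside pytest, assuming the test suite's mock-brain helper (mb.create_mock_3dball_brain) and the bundled test.demo file are available from the test directory:

from mlagents.trainers.tests import mock_brain as mb  # assumed test helper module
from mlagents.trainers.tests.test_bcmodule import create_bc_module, ppo_dummy_config

brain = mb.create_mock_3dball_brain()
bc_module = create_bc_module(brain, ppo_dummy_config(), False, "test.demo", True)

stats = bc_module.update()  # one BC pass over the demonstration buffer
for name, value in stats.items():
    print(name, float(value))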
