
Fix ghost trainer and all tests

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit 48b39b80
10 files changed, with 129 insertions and 380 deletions
  1. ml-agents/mlagents/trainers/ghost/trainer.py (9 changed lines)
  2. ml-agents/mlagents/trainers/tests/mock_brain.py (3 changed lines)
  3. ml-agents/mlagents/trainers/tests/test_bcmodule.py (1 changed line)
  4. ml-agents/mlagents/trainers/tests/test_ghost.py (2 changed lines)
  5. ml-agents/mlagents/trainers/tests/test_policy.py (14 changed lines)
  6. ml-agents/mlagents/trainers/tests/test_ppo.py (91 changed lines)
  7. ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changed lines)
  8. ml-agents/mlagents/trainers/tests/test_sac.py (255 changed lines)
  9. ml-agents/mlagents/trainers/tf_policy.py (7 changed lines)
  10. ml-agents/mlagents/trainers/tests/test_multigpu.py (123 changed lines)

ml-agents/mlagents/trainers/ghost/trainer.py (9 changed lines)


return self.trainer.create_policy(brain_parameters)
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
# for saving/swapping snapshots
policy.init_load_weights()
self.policies[name_behavior_id] = policy
# First policy encountered

self._save_snapshot(policy)
self._save_snapshot(policy) # Need to save after trainer initializes policy
else:
# Normally Optimizer initializes policy. Do it here instead.
policy.create_tf_graph()
# for saving/swapping snapshots
policy.init_load_weights()
def get_policy(self, name_behavior_id: str) -> TFPolicy:
return self.policies[name_behavior_id]
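Read as a whole, the change makes add_policy responsible for building the TF graph when the wrapped trainer will not do it, and for taking the first snapshot only after the trainer has initialized the policy. A rough sketch of the resulting control flow; the branch condition and the delegation to the wrapped trainer are assumptions, since only fragments of the method appear in this hunk:

    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        self.policies[name_behavior_id] = policy
        if name_behavior_id == self.learning_behavior_name:  # assumed: first policy encountered
            # The wrapped trainer builds and initializes the graph for its own policy,
            # so the snapshot can only be taken afterwards.
            self.trainer.add_policy(name_behavior_id, policy)  # assumed delegation
            policy.init_load_weights()  # for saving/swapping snapshots
            self._save_snapshot(policy)  # Need to save after trainer initializes policy
        else:
            # Normally the Optimizer initializes the policy. Do it here instead,
            # because this policy never gets an optimizer of its own.
            policy.create_tf_graph()
            policy.init_load_weights()  # for saving/swapping snapshots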

ml-agents/mlagents/trainers/tests/mock_brain.py (3 changed lines)


done = False
if is_discrete:
action_size = len(action_space)
action_probs = np.ones(np.sum(action_space), dtype=np.float32)
action_probs = np.ones((1), dtype=np.float32)
action_probs = np.ones(action_size, dtype=np.float32)
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[[False for _ in range(branch)] for branch in action_space]
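The hunk above shapes the fake evaluation outputs for a discrete action space: one entry per branch (action_size = len(action_space)) rather than one per possible action (np.sum(action_space)). A standalone sketch of the discrete case, using the DISCRETE_ACTION_SPACE value that appears in the PPO/SAC tests below:

    import numpy as np

    action_space = [3, 3, 3, 2]                # one entry per discrete branch
    action_size = len(action_space)            # the agent picks one action per branch
    action_probs = np.ones(action_size, dtype=np.float32)  # one entry per branch
    action_pre = np.zeros(action_size, dtype=np.float32)   # pre-activation placeholder
    # Per-branch boolean mask with every entry False, i.e. the mock flags nothing:
    action_mask = [[False for _ in range(branch)] for branch in action_space]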

ml-agents/mlagents/trainers/tests/test_bcmodule.py (1 changed line)


default_num_epoch=3,
**trainer_config["behavioral_cloning"],
)
policy.initialize_or_load() # Normally the optimizer calls this after the BCModule is created
return bc_module

ml-agents/mlagents/trainers/tests/test_ghost.py (2 changed lines)


)
trainer.seed = 1
policy = trainer.create_policy(mock_brain)
policy.create_tf_graph()
to_load_policy.create_tf_graph()
to_load_policy.init_load_weights()
weights = policy.get_weights()
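Because NNPolicy no longer builds its graph on construction, the ghost test now has to call create_tf_graph() on both policies before weights can be read or loaded. A condensed sketch of that setup (trainer and mock_brain construction omitted; the final load/compare step is an assumption about the rest of the test, which is not shown in this hunk):

    policy = trainer.create_policy(mock_brain)
    policy.create_tf_graph()             # no optimizer builds the graph for us any more

    to_load_policy = trainer.create_policy(mock_brain)
    to_load_policy.create_tf_graph()
    to_load_policy.init_load_weights()   # set up the load ops used for snapshot swapping

    weights = policy.get_weights()
    to_load_policy.load_weights(weights)             # assumed: copy weights across policies
    assert len(weights) == len(to_load_policy.get_weights())   # illustrative check only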

ml-agents/mlagents/trainers/tests/test_policy.py (14 changed lines)


return {"use_recurrent": False, "model_path": "my/path"}
class FakePolicy(TFPolicy):
def create_tf_graph(self):
pass
def get_trainable_variables(self):
return []
policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
# Doesn't really matter what this is
dummy_groupspec = AgentGroupSpec([(1,)], "continuous", 1)
no_agent_step = BatchedStepResult.empty(dummy_groupspec)

def test_take_action_returns_nones_on_missing_values():
test_seed = 3
policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy.evaluate = MagicMock(return_value={})
policy.save_memories = MagicMock()
step_with_agents = BatchedStepResult(

def test_take_action_returns_action_info_when_available():
test_seed = 3
policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
policy_eval_out = {
"action": np.array([1.0], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),

ml-agents/mlagents/trainers/tests/test_ppo.py (91 changed lines)


import yaml
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.agent_processor import AgentManagerQueue

summary_freq: 1000
use_recurrent: false
normalize: true
memory_size: 8
memory_size: 10
curiosity_strength: 0.0
curiosity_enc_size: 1
summary_path: test

VECTOR_ACTION_SPACE = [2]
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 32
BUFFER_INIT_SAMPLES = 64
def create_ppo_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
mock_brain = mb.setup_mock_brain(
use_discrete,
use_visual,
vector_action_space=VECTOR_ACTION_SPACE,
vector_obs_space=VECTOR_OBS_SPACE,
discrete_action_space=DISCRETE_ACTION_SPACE,
)
trainer_parameters = dummy_config
model_path = "testmodel"
trainer_parameters["model_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
policy = NNPolicy(
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
)
optimizer = PPOOptimizer(policy, trainer_parameters)
return optimizer
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
# Test evaluate
tf.reset_default_graph()
optimizer = create_ppo_optimizer_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // dummy_config["sequence_length"],
)
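The pattern running through this file is that update tests now target PPOOptimizer directly: the policy is created with create_tf_graph=False and the optimizer finishes graph construction. Condensed from the hunk above, with nothing new beyond reordering the fragments:

    policy = NNPolicy(
        0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
    )
    optimizer = PPOOptimizer(policy, trainer_parameters)  # builds the graph the policy deferred

    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    # Mock out reward-signal evaluation by reusing the raw environment rewards:
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // dummy_config["sequence_length"],
    )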
@mock.patch("mlagents_envs.environment.UnityEnvironment.executable_launcher")
@mock.patch("mlagents_envs.environment.UnityEnvironment.get_communicator")
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):

)
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
policy = NNPolicy(0, brain_params, dummy_config, False, False)
policy = NNPolicy(
0, brain_params, dummy_config, False, False, create_tf_graph=False
)
optimizer = PPOOptimizer(policy, dummy_config)
time_horizon = 15
trajectory = make_fake_trajectory(
length=time_horizon,

action_space=[2],
)
run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=False)
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=False
)
assert type(val) is float
assert len(val) == 15
run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
for key, val in run_out.items():
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=True
)
for key, val in final_value_out.items():
policy.reward_signals["extrinsic"].use_terminal_states = False
run_out = policy.get_value_estimates(trajectory.next_obs, "test_agent", done=True)
for key, val in run_out.items():
optimizer.reward_signals["extrinsic"].use_terminal_states = False
run_out, final_value_out = optimizer.get_trajectory_value_estimates(
trajectory.to_agentbuffer(), trajectory.next_obs, done=False
)
for key, val in final_value_out.items():
agentbuffer = trajectory.to_agentbuffer()
batched_values = policy.get_batched_value_estimates(agentbuffer)
for values in batched_values.values():
assert len(values) == 15
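The value-estimate test moves from policy.get_value_estimates/get_batched_value_estimates to the optimizer's trajectory-level API. A condensed sketch of the new call, per the hunk above (time_horizon is 15; the comments describe the return values as this test treats them):

    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    # run_out: per-step value estimates, one array of length time_horizon per reward signal
    # final_value_out: one bootstrap value per reward signal for the step after the trajectory
    for key, val in run_out.items():
        assert len(val) == 15

    # The test also flips use_terminal_states on the extrinsic reward signal and re-runs
    # the call to check how the terminal/bootstrap values behave:
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )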
def test_rl_functions():
rewards = np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32)

)
def test_trainer_increment_step(dummy_config):
@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
def test_trainer_increment_step(ppo_optimizer, dummy_config):
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer
brain_params = BrainParameters(
brain_name="test_brain",
vector_observation_space_size=1,

assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
def test_add_get_policy(dummy_config):
@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
def test_add_get_policy(ppo_optimizer, dummy_config):
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
ppo_optimizer.return_value = mock_optimizer
dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)

ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changed lines)


trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["reward_signals"].update(reward_signal_config)
trainer_parameters["use_recurrent"] = use_rnn
policy = NNPolicy(0, mock_brain, trainer_parameters, False, False)
policy = NNPolicy(
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
)
if trainer_parameters["trainer"] == "sac":
optimizer = SACOptimizer(policy, trainer_parameters)
else:

ml-agents/mlagents/trainers/tests/test_sac.py (255 changed lines)


from unittest import mock
import yaml
import numpy as np
from mlagents.trainers.sac.models import SACModel
from mlagents.trainers.sac.policy import SACPolicy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.common.nn_policy import NNPolicy
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory

VECTOR_ACTION_SPACE = [2]
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 32
BUFFER_INIT_SAMPLES = 64
def create_sac_policy_mock(dummy_config, use_rnn, use_discrete, use_visual):
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
mock_brain = mb.setup_mock_brain(
use_discrete,
use_visual,

trainer_parameters["model_path"] = model_path
trainer_parameters["keep_checkpoints"] = 3
trainer_parameters["use_recurrent"] = use_rnn
policy = SACPolicy(0, mock_brain, trainer_parameters, False, False)
return policy
policy = NNPolicy(
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
)
optimizer = SACOptimizer(policy, trainer_parameters)
return optimizer
def test_sac_cc_policy(dummy_config):
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
policy = create_sac_policy_mock(
dummy_config, use_rnn=False, use_discrete=False, use_visual=False
optimizer = create_sac_optimizer_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
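test_sac.py follows the same migration: SACPolicy and SACModel go away in favour of NNPolicy plus SACOptimizer, and one parametrized test drives optimizer.update across the discrete/visual/RNN variants. Condensed from the hunk above; the extrinsic_rewards line is an assumption mirroring the reward-signal mocking used elsewhere in this file:

    policy = NNPolicy(
        0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
    )
    optimizer = SACOptimizer(policy, trainer_parameters)

    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]  # assumed mock of reward-signal eval
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )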
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])

dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0
dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99
dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
policy = create_sac_policy_mock(
optimizer = create_sac_optimizer_mock(
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
policy.update_reward_signals(
optimizer.update_reward_signals(
def test_sac_dc_policy(dummy_config):
# Test evaluate
tf.reset_default_graph()
policy = create_sac_policy_mock(
dummy_config, use_rnn=False, use_discrete=True, use_visual=False
)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
def test_sac_visual_policy(dummy_config):
# Test evaluate
tf.reset_default_graph()
policy = create_sac_policy_mock(
dummy_config, use_rnn=False, use_discrete=True, use_visual=True
)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
run_out = policy.update(update_buffer, num_sequences=update_buffer.num_experiences)
assert type(run_out) is dict
def test_sac_rnn_policy(dummy_config):
# Test evaluate
tf.reset_default_graph()
policy = create_sac_policy_mock(
dummy_config, use_rnn=True, use_discrete=True, use_visual=False
)
step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)
run_out = policy.evaluate(step, list(step.agent_id))
assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
# Test update
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain, memory_size=8)
# Mock out reward signal eval
buffer["extrinsic_rewards"] = buffer["environment_rewards"]
update_buffer = AgentBuffer()
buffer.resequence_and_append(update_buffer, training_length=policy.sequence_length)
run_out = policy.update(
update_buffer,
num_sequences=update_buffer.num_experiences // policy.sequence_length,
)
def test_sac_model_cc_vector():
tf.reset_default_graph()
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
model = SACModel(
make_brain_parameters(discrete_action=False, visual_inputs=0)
)
init = tf.global_variables_initializer()
sess.run(init)
run_list = [model.output, model.value, model.entropy, model.learning_rate]
feed_dict = {
model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
}
sess.run(run_list, feed_dict=feed_dict)
def test_sac_model_cc_visual():
tf.reset_default_graph()
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
model = SACModel(
make_brain_parameters(discrete_action=False, visual_inputs=2)
)
init = tf.global_variables_initializer()
sess.run(init)
run_list = [model.output, model.value, model.entropy, model.learning_rate]
feed_dict = {
model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)
def test_sac_model_dc_visual():
tf.reset_default_graph()
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
model = SACModel(
make_brain_parameters(discrete_action=True, visual_inputs=2)
)
init = tf.global_variables_initializer()
sess.run(init)
run_list = [model.output, model.value, model.entropy, model.learning_rate]
feed_dict = {
model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)
def test_sac_model_dc_vector():
tf.reset_default_graph()
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
model = SACModel(
make_brain_parameters(discrete_action=True, visual_inputs=0)
)
init = tf.global_variables_initializer()
sess.run(init)
run_list = [model.output, model.value, model.entropy, model.learning_rate]
feed_dict = {
model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)
def test_sac_model_dc_vector_rnn():
tf.reset_default_graph()
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
memory_size = 128
model = SACModel(
make_brain_parameters(discrete_action=True, visual_inputs=0),
use_recurrent=True,
m_size=memory_size,
)
init = tf.global_variables_initializer()
sess.run(init)
run_list = [
model.output,
model.all_log_probs,
model.value,
model.entropy,
model.learning_rate,
model.memory_out,
]
feed_dict = {
model.batch_size: 1,
model.sequence_length: 2,
model.prev_action: [[0], [0]],
model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.action_masks: np.ones([1, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)
def test_sac_model_cc_vector_rnn():
tf.reset_default_graph()
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
memory_size = 128
model = SACModel(
make_brain_parameters(discrete_action=False, visual_inputs=0),
use_recurrent=True,
m_size=memory_size,
)
init = tf.global_variables_initializer()
sess.run(init)
run_list = [
model.output,
model.all_log_probs,
model.value,
model.entropy,
model.learning_rate,
model.memory_out,
]
feed_dict = {
model.batch_size: 1,
model.sequence_length: 2,
model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
}
sess.run(run_list, feed_dict=feed_dict)
def test_sac_save_load_buffer(tmpdir, dummy_config):
mock_brain = mb.setup_mock_brain(
False,

assert trainer2.update_buffer.num_experiences == buffer_len
def test_add_get_policy(dummy_config):
@mock.patch("mlagents.trainers.sac.trainer.SACOptimizer")
def test_add_get_policy(sac_optimizer, dummy_config):
mock_optimizer = mock.Mock()
mock_optimizer.reward_signals = {}
sac_optimizer.return_value = mock_optimizer
policy = mock.Mock(spec=SACPolicy)
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000
trainer.add_policy(brain_params.brain_name, policy)
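Both trainer-level tests (the PPO one above and this SAC one) stop building a real optimizer: they patch the optimizer class so add_policy can run without constructing a TF graph. A condensed sketch; the SACTrainer constructor arguments are an assumption mirrored from the PPOTrainer call shown earlier, and the final assertion is illustrative only:

    @mock.patch("mlagents.trainers.sac.trainer.SACOptimizer")
    def test_add_get_policy(sac_optimizer, dummy_config):
        mock_optimizer = mock.Mock()
        mock_optimizer.reward_signals = {}
        sac_optimizer.return_value = mock_optimizer   # the trainer receives this stand-in

        trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)  # assumed signature
        policy = mock.Mock(spec=NNPolicy)
        policy.get_current_step.return_value = 2000

        trainer.add_policy(brain_params.brain_name, policy)
        assert trainer.get_policy(brain_params.brain_name) == policy   # illustrative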

ml-agents/mlagents/trainers/tf_policy.py (7 changed lines)


"The memory size for brain {0} is 0 even "
"though the trainer uses recurrent.".format(brain.brain_name)
)
elif self.m_size % 4 != 0:
raise UnityPolicyException(
"The memory size for brain {0} is {1} "
"but it must be divisible by 4.".format(
brain.brain_name, self.m_size
)
)
self._initialize_tensorflow_references()
self.load = load
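For reference, the memory-size validation this hunk belongs to rejects recurrent configurations whose m_size is zero or not a multiple of 4; only the two error messages appear above, so the surrounding condition and the source of m_size are assumptions in this sketch:

    if self.use_recurrent:
        self.m_size = trainer_parameters["memory_size"]   # assumed source of m_size
        if self.m_size == 0:
            raise UnityPolicyException(
                "The memory size for brain {0} is 0 even "
                "though the trainer uses recurrent.".format(brain.brain_name)
            )
        elif self.m_size % 4 != 0:
            raise UnityPolicyException(
                "The memory size for brain {0} is {1} "
                "but it must be divisible by 4.".format(brain.brain_name, self.m_size)
            )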

ml-agents/mlagents/trainers/tests/test_multigpu.py (123 changed lines)


from unittest import mock
import pytest
from mlagents.tf_utils import tf
import yaml
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuNNPolicy
from mlagents.trainers.tests.mock_brain import create_mock_brainparams
@pytest.fixture
def dummy_config():
return yaml.safe_load(
"""
trainer: ppo
batch_size: 32
beta: 5.0e-3
buffer_size: 512
epsilon: 0.2
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
normalize: true
num_epoch: 5
num_layers: 2
time_horizon: 64
sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
curiosity_strength: 0.0
curiosity_enc_size: 1
reward_signals:
extrinsic:
strength: 1.0
gamma: 0.99
"""
)
@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
def test_create_model(mock_get_devices, dummy_config):
tf.reset_default_graph()
mock_get_devices.return_value = [
"/device:GPU:0",
"/device:GPU:1",
"/device:GPU:2",
"/device:GPU:3",
]
trainer_parameters = dummy_config
trainer_parameters["model_path"] = ""
trainer_parameters["keep_checkpoints"] = 3
brain = create_mock_brainparams()
policy = MultiGpuNNPolicy(0, brain, trainer_parameters, False, False)
assert len(policy.towers) == len(mock_get_devices.return_value)
@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
def test_average_gradients(mock_get_devices, dummy_config):
tf.reset_default_graph()
mock_get_devices.return_value = [
"/device:GPU:0",
"/device:GPU:1",
"/device:GPU:2",
"/device:GPU:3",
]
trainer_parameters = dummy_config
trainer_parameters["model_path"] = ""
trainer_parameters["keep_checkpoints"] = 3
brain = create_mock_brainparams()
with tf.Session() as sess:
policy = MultiGpuNNPolicy(0, brain, trainer_parameters, False, False)
var = tf.Variable(0)
tower_grads = [
[(tf.constant(0.1), var)],
[(tf.constant(0.2), var)],
[(tf.constant(0.3), var)],
[(tf.constant(0.4), var)],
]
avg_grads = policy.average_gradients(tower_grads)
init = tf.global_variables_initializer()
sess.run(init)
run_out = sess.run(avg_grads)
assert run_out == [(0.25, 0)]
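test_multigpu.py is deleted along with MultiGpuNNPolicy, but the expectation it encoded is simple: average_gradients takes per-tower lists of (gradient, variable) pairs and averages the gradients element-wise, so towers reporting 0.1, 0.2, 0.3 and 0.4 for the same variable yield 0.25. A plain-NumPy sketch of that reduction (not the deleted TF implementation):

    import numpy as np

    def average_tower_gradients(tower_grads):
        # tower_grads: one list of (gradient, variable) pairs per tower,
        # with every tower listing the same variables in the same order.
        averaged = []
        for pairs in zip(*tower_grads):            # group the same variable across towers
            grads = [g for g, _ in pairs]
            var = pairs[0][1]                      # the shared variable
            averaged.append((float(np.mean(grads)), var))
        return averaged

    tower_grads = [[(0.1, "var")], [(0.2, "var")], [(0.3, "var")], [(0.4, "var")]]
    print(average_tower_gradients(tower_grads))    # [(0.25, 'var')]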
@mock.patch("mlagents.trainers.tf_policy.TFPolicy._execute_model")
@mock.patch("mlagents.trainers.common.nn_policy.NNPolicy.construct_feed_dict")
@mock.patch("mlagents.trainers.ppo.multi_gpu_policy.get_devices")
def test_update(
mock_get_devices, mock_construct_feed_dict, mock_execute_model, dummy_config
):
tf.reset_default_graph()
mock_get_devices.return_value = ["/device:GPU:0", "/device:GPU:1"]
mock_construct_feed_dict.return_value = {}
mock_execute_model.return_value = {
"value_loss": 0.1,
"policy_loss": 0.3,
"update_batch": None,
}
trainer_parameters = dummy_config
trainer_parameters["model_path"] = ""
trainer_parameters["keep_checkpoints"] = 3
brain = create_mock_brainparams()
policy = MultiGpuNNPolicy(0, brain, trainer_parameters, False, False)
mock_mini_batch = mock.Mock()
mock_mini_batch.items.return_value = [("action", [1, 2]), ("value", [3, 4])]
run_out = policy.update(mock_mini_batch, 1)
assert mock_mini_batch.items.call_count == len(mock_get_devices.return_value)
assert mock_construct_feed_dict.call_count == len(mock_get_devices.return_value)
assert run_out["Losses/Value Loss"] == 0.1
assert run_out["Losses/Policy Loss"] == 0.3
if __name__ == "__main__":
pytest.main()