
Fix RL tests

Branch: develop-newnormalization
Ervin Teng, 5 years ago
Commit 29cdf77a
4 changed files, with 41 additions and 45 deletions
  1. ml-agents/mlagents/trainers/tests/mock_brain.py (15 lines changed)
  2. ml-agents/mlagents/trainers/tests/test_ppo.py (18 lines changed)
  3. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (8 lines changed)
  4. ml-agents/mlagents/trainers/tests/test_sac.py (45 lines changed)

ml-agents/mlagents/trainers/tests/mock_brain.py (15 lines changed)


  import numpy as np
  from mlagents.envs.brain import CameraResolution, BrainParameters
- from mlagents.trainers.buffer import Buffer
+ from mlagents.trainers.buffer import AgentProcessorBuffer, AgentBuffer
  def create_mock_brainparams(

      # If a key_list was given, remove those keys
      if exclude_key_list:
          for key in exclude_key_list:
-             if key in buffer.update_buffer:
-                 buffer.update_buffer.pop(key)
+             if key in buffer:
+                 buffer.pop(key)

-     buffer = Buffer()
+     buffer = AgentProcessorBuffer()
+     update_buffer = AgentBuffer()
      # Make a buffer
      for idx, experience in enumerate(brain_infos):
          if idx > len(brain_infos) - 2:

              )
              buffer[0]["memory"].append(np.ones(memory_size))
-     buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length)
-     return buffer
+     buffer.append_update_buffer(
+         update_buffer, 0, batch_size=None, training_length=sequence_length
+     )
+     return update_buffer
  def setup_mock_env_and_brains(
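
The pattern behind this change: the per-agent AgentProcessorBuffer no longer carries an internal update_buffer; callers create a standalone AgentBuffer and pass it to append_update_buffer, which flushes into that target. A minimal sketch of the new flow, using only the calls visible in this diff (the "rewards" field and the single appended value are illustrative, not taken from the helper):

    from mlagents.trainers.buffer import AgentProcessorBuffer, AgentBuffer

    processing_buffer = AgentProcessorBuffer()  # staging area, keyed by agent id
    update_buffer = AgentBuffer()               # flat buffer handed to trainers

    # Stage one illustrative experience for agent 0, then flush it into the
    # update buffer; append_update_buffer now takes the destination explicitly.
    processing_buffer[0]["rewards"].append(1.0)
    processing_buffer.append_update_buffer(
        update_buffer, 0, batch_size=None, training_length=1
    )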

ml-agents/mlagents/trainers/tests/test_ppo.py (18 lines changed)


      # Test update with sequence length smaller than batch size
      buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
      # Mock out reward signal eval
-     buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
-     buffer.update_buffer["extrinsic_returns"] = buffer.update_buffer["rewards"]
-     buffer.update_buffer["extrinsic_value_estimates"] = buffer.update_buffer["rewards"]
-     buffer.update_buffer["curiosity_rewards"] = buffer.update_buffer["rewards"]
-     buffer.update_buffer["curiosity_returns"] = buffer.update_buffer["rewards"]
-     buffer.update_buffer["curiosity_value_estimates"] = buffer.update_buffer["rewards"]
+     buffer["extrinsic_rewards"] = buffer["rewards"]
+     buffer["extrinsic_returns"] = buffer["rewards"]
+     buffer["extrinsic_value_estimates"] = buffer["rewards"]
+     buffer["curiosity_rewards"] = buffer["rewards"]
+     buffer["curiosity_returns"] = buffer["rewards"]
+     buffer["curiosity_value_estimates"] = buffer["rewards"]
-     trainer.training_buffer = buffer
+     trainer.update_buffer = buffer
      trainer.update_policy()
      # Make batch length a larger multiple of sequence length
      trainer.trainer_parameters["batch_size"] = 128

          agent_idx=idx,
          agent_next_idx=next_idx,
      )
-     assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
-     assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
+     assert trainer.processing_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
+     assert trainer.processing_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
  if __name__ == "__main__":
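
Since simulate_rollout now returns the flat AgentBuffer directly, the reward-signal mocking in these tests drops the .update_buffer indirection and the result is assigned to trainer.update_buffer. A condensed sketch of the updated test body (key names are taken from the diff; env, trainer, and BUFFER_INIT_SAMPLES come from the surrounding fixture setup and are assumed here):

    buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
    # Mock out reward signal eval by aliasing every derived key to the raw rewards.
    for key in (
        "extrinsic_rewards", "extrinsic_returns", "extrinsic_value_estimates",
        "curiosity_rewards", "curiosity_returns", "curiosity_value_estimates",
    ):
        buffer[key] = buffer["rewards"]
    trainer.update_buffer = buffer
    trainer.update_policy()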

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (8 lines changed)


  import mlagents.trainers.tests.mock_brain as mb
  import numpy as np
  from mlagents.trainers.rl_trainer import RLTrainer
- from mlagents.trainers.tests.test_buffer import construct_fake_buffer
+ from mlagents.trainers.tests.test_buffer import construct_fake_processing_buffer
+ from mlagents.trainers.buffer import AgentBuffer

      trainer.end_episode()
      for agent_id in trainer.episode_steps:
          assert trainer.episode_steps[agent_id] == 0
-         assert len(trainer.training_buffer[agent_id]["action"]) == 0
+         assert len(trainer.processing_buffer[agent_id]["action"]) == 0
      for rewards in trainer.collected_rewards.values():
          for agent_id in rewards:
              assert rewards[agent_id] == 0

      trainer = create_rl_trainer()
-     trainer.processing_buffer = construct_fake_buffer()
+     trainer.processing_buffer = construct_fake_processing_buffer()
-     trainer.training_buffer.append_update_buffer(
+     trainer.processing_buffer.append_update_buffer(
          trainer.update_buffer, 2, batch_size=None, training_length=2
      )
      trainer.clear_update_buffer()
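
The RL-trainer test mirrors the same split: a fake per-agent buffer comes from the renamed construct_fake_processing_buffer helper and is flushed into the trainer-owned update_buffer before clear_update_buffer is exercised. A condensed sketch, assuming create_rl_trainer and the helper behave as the diff suggests:

    trainer = create_rl_trainer()
    trainer.processing_buffer = construct_fake_processing_buffer()
    # Flush agent 2's staged experiences into the trainer-owned AgentBuffer,
    # then clear it; the test then checks that the update buffer is empty again.
    trainer.processing_buffer.append_update_buffer(
        trainer.update_buffer, 2, batch_size=None, training_length=2
    )
    trainer.clear_update_buffer()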

ml-agents/mlagents/trainers/tests/test_sac.py (45 lines changed)


      assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
      # Test update
-     buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
+     update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
-     buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
-     policy.update(
-         buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"])
-     )
+     update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
+     policy.update(update_buffer, num_sequences=len(update_buffer["actions"]))
      env.close()

      )
      # Test update, while removing PPO-specific buffer elements.
-     buffer = mb.simulate_rollout(
+     update_buffer = mb.simulate_rollout(
          env,
          policy,
          BUFFER_INIT_SAMPLES,

      # Mock out reward signal eval
-     buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
-     buffer.update_buffer["curiosity_rewards"] = buffer.update_buffer["rewards"]
+     update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
+     update_buffer["curiosity_rewards"] = update_buffer["rewards"]
-         {"curiosity": buffer.update_buffer},
-         num_sequences=len(buffer.update_buffer["actions"]),
+         {"curiosity": update_buffer}, num_sequences=len(update_buffer["actions"])
      )
      env.close()
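
For the reward-signal test, the dict handed to the updater now holds the AgentBuffer itself rather than reaching through .update_buffer. The enclosing call is cut off in this hunk, so the update_reward_signals name below is an assumption; the argument shapes are copied from the diff:

    update_buffer = mb.simulate_rollout(
        env, policy, BUFFER_INIT_SAMPLES  # extra kwargs elided here, as in the hunk
    )
    # Mock out reward signal eval.
    update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
    update_buffer["curiosity_rewards"] = update_buffer["rewards"]
    # Assumed wrapper name; only its arguments are visible in this diff.
    policy.update_reward_signals(
        {"curiosity": update_buffer}, num_sequences=len(update_buffer["actions"])
    )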

      assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
      # Test update
-     buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
+     update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
-     buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
-     policy.update(
-         buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"])
-     )
+     update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
+     policy.update(update_buffer, num_sequences=len(update_buffer["actions"]))
      env.close()

      assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
      # Test update
-     buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
+     update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
-     buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
-     run_out = policy.update(
-         buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"])
-     )
+     update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
+     run_out = policy.update(update_buffer, num_sequences=len(update_buffer["actions"]))
      assert type(run_out) is dict

      assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
      # Test update
-     buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
+     update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES)
-     buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"]
-     policy.update(buffer.update_buffer, num_sequences=2)
+     update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
+     policy.update(update_buffer, num_sequences=2)
      env.close()

      trainer_params["model_path"] = str(tmpdir)
      trainer_params["save_replay_buffer"] = True
      trainer = SACTrainer(mock_brain, 1, trainer_params, True, False, 0, 0)
-     trainer.training_buffer = mb.simulate_rollout(
+     trainer.update_buffer = mb.simulate_rollout(

-     buffer_len = len(trainer.training_buffer.update_buffer["actions"])
+     buffer_len = len(trainer.update_buffer["actions"])

-     assert len(trainer2.training_buffer.update_buffer["actions"]) == buffer_len
+     assert len(trainer2.update_buffer["actions"]) == buffer_len

  if __name__ == "__main__":
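
With the trainer's replay buffer now exposed directly as update_buffer, the save/load round-trip compares lengths on that attribute. A hedged sketch of the round trip; the simulate_rollout arguments and the construction of trainer2 (a second trainer that loads the saved buffer) are not shown in the hunk and are assumptions here:

    trainer = SACTrainer(mock_brain, 1, trainer_params, True, False, 0, 0)
    trainer.update_buffer = mb.simulate_rollout(
        env, trainer.policy, BUFFER_INIT_SAMPLES  # arguments assumed; truncated in the hunk
    )
    buffer_len = len(trainer.update_buffer["actions"])
    # ... save the trainer (save_replay_buffer=True persists the buffer), then
    # build trainer2 with loading enabled so it restores the saved replay buffer ...
    assert len(trainer2.update_buffer["actions"]) == buffer_len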