|
|
|
|
|
|
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0]) |
|
|
|
|
|
|
|
# Test update |
|
|
|
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"] |
|
|
|
policy.update( |
|
|
|
buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"]) |
|
|
|
) |
|
|
|
update_buffer["extrinsic_rewards"] = update_buffer["rewards"] |
|
|
|
policy.update(update_buffer, num_sequences=len(update_buffer["actions"])) |
|
|
|
env.close() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
)

# Test update, while removing PPO-specific buffer elements.
update_buffer = mb.simulate_rollout(
    env,
    policy,
    BUFFER_INIT_SAMPLES,
)

# Mock out reward signal eval
update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
update_buffer["curiosity_rewards"] = update_buffer["rewards"]
policy.update_reward_signals(
    {"curiosity": update_buffer}, num_sequences=len(update_buffer["actions"])
)
env.close()
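# The "curiosity" signal exercised above is assumed to be enabled in the test
# configuration; an illustrative snippet of that setup (variable and key names
# are assumptions, not taken from this file):
#
#     dummy_config["reward_signals"]["curiosity"] = {
#         "strength": 1.0,
#         "gamma": 0.99,
#         "encoding_size": 128,
#     }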
|
|
|
|
|
|
|
|
|
|
assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE)) |
|
|
|
|
|
|
|
# Test update |
|
|
|
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"] |
|
|
|
policy.update( |
|
|
|
buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"]) |
|
|
|
) |
|
|
|
update_buffer["extrinsic_rewards"] = update_buffer["rewards"] |
|
|
|
policy.update(update_buffer, num_sequences=len(update_buffer["actions"])) |
|
|
|
env.close() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE)) |
|
|
|
|
|
|
|
# Test update |
|
|
|
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"] |
|
|
|
run_out = policy.update( |
|
|
|
buffer.update_buffer, num_sequences=len(buffer.update_buffer["actions"]) |
|
|
|
) |
|
|
|
update_buffer["extrinsic_rewards"] = update_buffer["rewards"] |
|
|
|
run_out = policy.update(update_buffer, num_sequences=len(update_buffer["actions"])) |
|
|
|
assert type(run_out) is dict |
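# policy.update returns a dict of training statistics (e.g. loss values); the
# exact keys differ across releases, so only the return type is asserted above.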
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE)) |
|
|
|
|
|
|
|
# Test update |
|
|
|
buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
update_buffer = mb.simulate_rollout(env, policy, BUFFER_INIT_SAMPLES) |
|
|
|
buffer.update_buffer["extrinsic_rewards"] = buffer.update_buffer["rewards"] |
|
|
|
policy.update(buffer.update_buffer, num_sequences=2) |
|
|
|
update_buffer["extrinsic_rewards"] = update_buffer["rewards"] |
|
|
|
policy.update(update_buffer, num_sequences=2) |
|
|
|
env.close() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
trainer_params["model_path"] = str(tmpdir) |
|
|
|
trainer_params["save_replay_buffer"] = True |
|
|
|
trainer = SACTrainer(mock_brain, 1, trainer_params, True, False, 0, 0) |
|
|
|
trainer.training_buffer = mb.simulate_rollout( |
|
|
|
trainer.update_buffer = mb.simulate_rollout( |
|
|
|
buffer_len = len(trainer.training_buffer.update_buffer["actions"]) |
|
|
|
buffer_len = len(trainer.update_buffer["actions"]) |
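# A second trainer is asserted against below; a plausible reconstruction of the
# intervening save/re-load step, assuming save_model() persists the replay
# buffer when "save_replay_buffer" is enabled and that the fifth positional
# argument of SACTrainer is the load flag:
trainer.save_model()
trainer2 = SACTrainer(mock_brain, 1, trainer_params, True, True, 0, 0)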
|
|
|
assert len(trainer2.update_buffer["actions"]) == buffer_len
|
|
if __name__ == "__main__": |