        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
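    # Worked example (hypothetical spec, not taken from this test): with
    # discrete_branches=(3, 2) the mocked array above has shape (n_agents, 5),
    # one log prob per action in every branch as TF would report them; a
    # continuous action_spec of size 4 would instead give shape (n_agents, 4).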
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
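    # For illustration (hypothetical numbers): with 64 experiences in the buffer and
    # sequence_length=16 (an RNN configuration), num_sequences would be 64 // 16 = 4;
    # without memories sequence_length is typically 1, so num_sequences equals the
    # number of experiences.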

        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )

        use_visual=False,
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
buffer["curiosity_returns"] = buffer["environment_rewards"] |
|
|
|
buffer["curiosity_value_estimates"] = buffer["environment_rewards"] |
|
|
|
buffer["advantages"] = buffer["environment_rewards"] |
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                )
            )
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(
                    mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
                )
            )
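    # Note the different mocking style here: the fields are rebuilt with reset_field() and
    # per-agent append() calls instead of being replaced wholesale by a numpy array as in
    # the optimizer tests above, presumably so the entries remain ordinary buffer fields.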
    trainer.update_buffer = buffer
    trainer._update_policy()