# NOTE: In TensorFlow, the log_probs are saved as one entry per discrete action, whereas
# in PyTorch they are saved as the total log probability per branch. So we need to modify
# the log probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
    update_buffer["discrete_log_probs"] = np.ones_like(
        update_buffer["discrete_action"]
    )
else:
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"]
    )
return_stats = optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
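
# Illustrative sketch (not part of the test) of the layout difference the NOTE above
# describes, using made-up numbers: a batch of 4 steps and two discrete branches with
# 3 and 2 choices. A TF-style buffer keeps one log prob per possible discrete action
# (sum of branch sizes), while the PyTorch-style buffer keeps one total log prob per
# branch, which is why np.ones_like(discrete_action) produces the right shape above.
import numpy as np  # already available as np in the surrounding test code

_batch_size = 4
_branch_sizes = [3, 2]  # hypothetical action spec, for illustration only
_tf_style_log_probs = np.zeros((_batch_size, sum(_branch_sizes)))     # shape (4, 5): per action
_torch_style_log_probs = np.zeros((_batch_size, len(_branch_sizes)))  # shape (4, 2): per branch
assert _tf_style_log_probs.shape == (4, 5)
assert _torch_style_log_probs.shape == (4, 2)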

# NOTE: In TensorFlow, the log_probs are saved as one entry per discrete action, whereas
# in PyTorch they are saved as the total log probability per branch. So we need to modify
# the log probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
    update_buffer["discrete_log_probs"] = np.ones_like(
        update_buffer["discrete_action"]
    )
else:
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"]
    )
optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)

# Mock out the reward signal value estimates and returns with the raw environment
# rewards so the update does not depend on the real extrinsic/GAIL evaluators.
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
    update_buffer["continuous_action"]
)
optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
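
# Minimal sketch of the mocking pattern used above: each reward-signal statistic the
# optimizer reads during the update is pre-filled with the raw environment rewards.
# The key names are the ones used literally above; whether additional
# "<signal>_returns" / "<signal>_value_estimates" keys are needed depends on which
# reward signals the optimizer was configured with (an assumption, not verified here).
for _key in (
    "extrinsic_value_estimates",
    "gail_returns",
    "gail_value_estimates",
):
    update_buffer[_key] = update_buffer["environment_rewards"]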

# NOTE: In TensorFlow, the log_probs are saved as one entry per discrete action, whereas
# in PyTorch they are saved as the total log probability per branch. So we need to modify
# the log probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
update_buffer["continuous_log_probs"] = np.ones_like(
    update_buffer["continuous_action"]
)
optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
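
# Worked example of the num_sequences argument passed to each update call above,
# with assumed sizes: if the fake buffer holds 64 experiences and the policy's
# sequence_length is 16, the buffer is split into 64 // 16 == 4 full sequences.
_num_experiences = 64   # hypothetical buffer size
_sequence_length = 16   # hypothetical policy.sequence_length
assert _num_experiences // _sequence_length == 4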