# NOTE: In TensorFlow, the log_probs are saved as one entry per discrete action, whereas
# in PyTorch they are saved as the total log probability per branch. So we need to modify
# the log probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
    update_buffer["discrete_log_probs"] = np.ones_like(
        update_buffer["discrete_action"]
    )
else:
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"]
    )
return_stats = optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
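
# Illustrative sketch (not part of the test) of the layout difference the NOTE above
# describes, using made-up numbers: a batch of 4 steps and two discrete branches with
# 3 and 2 choices. A TF-style buffer keeps one log prob per possible discrete action
# (sum of branch sizes), while the PyTorch-style buffer keeps one total log prob per
# branch, which is why np.ones_like(discrete_action) produces the right shape above.
import numpy as np  # already available as np in the surrounding test code

_batch_size = 4
_branch_sizes = [3, 2]  # hypothetical action spec, for illustration only
_tf_style_log_probs = np.zeros((_batch_size, sum(_branch_sizes)))     # shape (4, 5): per action
_torch_style_log_probs = np.zeros((_batch_size, len(_branch_sizes)))  # shape (4, 2): per branch
assert _tf_style_log_probs.shape == (4, 5)
assert _torch_style_log_probs.shape == (4, 2)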

# NOTE: In TensorFlow, the log_probs are saved as one entry per discrete action, whereas
# in PyTorch they are saved as the total log probability per branch. So we need to modify
# the log probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
if discrete:
    update_buffer["discrete_log_probs"] = np.ones_like(
        update_buffer["discrete_action"]
    )
else:
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"]
    )
optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)

# Mock out the reward signal value estimates and returns with the raw environment
# rewards so the update does not depend on the real extrinsic/GAIL evaluators.
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["continuous_log_probs"] = np.ones_like(
    update_buffer["continuous_action"]
)
optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
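
# Minimal sketch of the mocking pattern used above: each reward-signal statistic the
# optimizer reads during the update is pre-filled with the raw environment rewards.
# The key names are the ones used literally above; whether additional
# "<signal>_returns" / "<signal>_value_estimates" keys are needed depends on which
# reward signals the optimizer was configured with (an assumption, not verified here).
for _key in (
    "extrinsic_value_estimates",
    "gail_returns",
    "gail_value_estimates",
):
    update_buffer[_key] = update_buffer["environment_rewards"]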

# NOTE: In TensorFlow, the log_probs are saved as one entry per discrete action, whereas
# in PyTorch they are saved as the total log probability per branch. So we need to modify
# the log probs in the fake buffer here.
update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
update_buffer["continuous_log_probs"] = np.ones_like(
    update_buffer["continuous_action"]
)
optimizer.update(
    update_buffer,
    num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
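
# Worked example of the num_sequences argument passed to each update call above,
# with assumed sizes: if the fake buffer holds 64 experiences and the policy's
# sequence_length is 16, the buffer is split into 64 // 16 == 4 full sequences.
_num_experiences = 64   # hypothetical buffer size
_sequence_length = 16   # hypothetical policy.sequence_length
assert _num_experiences // _sequence_length == 4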