
fix torch test ppo

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit
22f42f5b
2 files changed, 8 insertions and 21 deletions
  1. ml-agents/mlagents/trainers/tests/mock_brain.py (18 changes)
  2. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (11 changes)

ml-agents/mlagents/trainers/tests/mock_brain.py (18 changes)


     steps_list = []
     action_size = action_spec.discrete_size + action_spec.continuous_size
-    prob_ones = np.ones(
-        int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
-        dtype=np.float32,
-    )
     for _i in range(length - 1):
         obs = []
         for _shape in observation_shapes:

-        if action_spec.is_continuous():
-            action = ActionTuple(continuous=np.zeros(action_size, dtype=np.float32))
-            action_probs = LogProbsTuple(continuous=prob_ones)
-        else:
-            action = ActionTuple(discrete=np.zeros(action_size, dtype=np.float32))
-            action_probs = LogProbsTuple(discrete=prob_ones)
+        action = ActionTuple(
+            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
+            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
+        )
+        action_probs = LogProbsTuple(
+            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
+            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
+        )
         action_pre = np.zeros(action_size, dtype=np.float32)
         action_mask = (
             [
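For reference, a minimal numpy-only sketch of the log-prob shape change this hunk makes to the fake trajectory data; it is not the ml-agents API, and the continuous_size/discrete_branches values below are hypothetical:

import numpy as np

# Hypothetical hybrid action spec: 2 continuous dimensions and 2 discrete branches
# with 3 and 2 choices each. These sizes are only for illustration.
continuous_size = 2
discrete_branches = (3, 2)
discrete_size = len(discrete_branches)

# Old fake data: one log prob per discrete choice plus the continuous dims,
# i.e. 3 + 2 + 2 = 7 entries (the TensorFlow-style layout).
old_prob_ones = np.ones(
    int(np.sum(discrete_branches) + continuous_size), dtype=np.float32
)

# New fake data: one log prob per continuous dimension and one per discrete
# branch, matching what the torch trainers store.
new_continuous_log_probs = np.ones(continuous_size, dtype=np.float32)
new_discrete_log_probs = np.ones(discrete_size, dtype=np.float32)

print(old_prob_ones.shape)             # (7,)
print(new_continuous_log_probs.shape)  # (2,)
print(new_discrete_log_probs.shape)    # (2,)

With both the continuous and discrete parts filled in for every fake step, the same mock works for continuous, discrete, and hybrid action specs.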

ml-agents/mlagents/trainers/tests/torch/test_ppo.py (11 changes)


     update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
     update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
-    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
-    # in PyTorch it is saved as the total probability per branch. So we need to modify the
-    # log prob in the fake buffer here.
-    if discrete:
-        update_buffer["discrete_log_probs"] = np.ones_like(
-            update_buffer["discrete_action"]
-        )
-    else:
-        update_buffer["continuous_log_probs"] = np.ones_like(
-            update_buffer["continuous_action"]
-        )
     return_stats = optimizer.update(
         update_buffer,
         num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,