        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
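    # Worked example (hypothetical spec, not taken from this test): with
    # discrete_branches=(3, 2) the mocked array above has shape (n_agents, 5),
    # one log prob per action in every branch as TF would report them; a
    # continuous action_spec of size 4 would instead give shape (n_agents, 4).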
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
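    # For illustration (hypothetical numbers): with 64 experiences in the buffer and
    # sequence_length=16 (an RNN configuration), num_sequences would be 64 // 16 = 4;
    # without memories sequence_length is typically 1, so num_sequences equals the
    # number of experiences.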

        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )

        use_visual=False,
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
buffer["curiosity_returns"] = buffer["environment_rewards"] |
|
|
|
buffer["curiosity_value_estimates"] = buffer["environment_rewards"] |
|
|
|
buffer["advantages"] = buffer["environment_rewards"] |
    # NOTE: The log probs are overwritten here because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                )
            )
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(
                    mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
                )
            )
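    # Note the different mocking style here: the fields are rebuilt with reset_field() and
    # per-agent append() calls instead of being replaced wholesale by a numpy array as in
    # the optimizer tests above, presumably so the entries remain ordinary buffer fields.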
    trainer.update_buffer = buffer
    trainer._update_policy()