|
|
|
|
|
|
# Branch sizes for a multi-branch discrete action space used by the test
# mocks — presumably four branches with 3/3/3/2 choices each; verify against
# the policy-mock helpers that consume it (not fully visible in this chunk).
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]


# Number of samples used to seed/pre-fill a buffer in tests — TODO confirm
# against the consuming tests (usage not visible here).
BUFFER_INIT_SAMPLES = 32


# Number of simultaneous agents the mock policy is exercised with; used as
# the batch dimension when asserting action shapes.
NUM_AGENTS = 12


# Small tolerance constant, presumably for float comparisons or
# divide-by-zero protection — confirm at usage sites.
EPSILON = 1e-7
|
|
|
|
|
|
|
|
|
|
|
def create_policy_mock( |
|
|
|
|
|
|
assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE) |
|
|
|
|
|
|
|
|
|
|
|
def _run_normalization_update(policy, obs_values):
    """Feed one fake trajectory of scalar observations into the policy's
    normalizer and return the updated running statistics.

    Builds a trajectory of ``len(obs_values)`` steps with a single
    (1,)-shaped vector observation, overwrites each step's observation with
    the corresponding value from ``obs_values``, pushes the trajectory's
    "vector_obs" through ``policy.update_normalization``, and reads back
    ``(normalization_steps, running_mean, running_variance)`` from the
    policy's TF session.
    """
    time_horizon = len(obs_values)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([obs_values[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])
    return policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )


def test_large_normalization():
    """Regression test: running normalization must stay numerically stable
    for large, nearly-constant observations.

    The observation values come from Walker seed 3713, which produced NaNs
    without proper variance initialization. After each update the running
    mean must match the sample mean, and running_variance / steps must match
    the sample variance, of all observations seen so far.
    """
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713 which causes NaN without proper initialization
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )

    # First update: stats should match the first observation batch alone.
    steps, mean, variance = _run_normalization_update(policy, large_obs1)
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    # Second update: stats should now reflect the concatenation of both batches.
    steps, mean, variance = _run_normalization_update(policy, large_obs2)
    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
time_horizon = 6 |
|
|
|
trajectory = make_fake_trajectory( |
|
|
|
length=time_horizon, |
|
|
|
|
|
|
|
|
|
|
assert steps == 6 |
|
|
|
assert mean[0] == 0.5 |
|
|
|
# Note: variance is divided by number of steps, and initialized to 1 to avoid |
|
|
|
# divide by 0. The right answer is 0.25 |
|
|
|
assert (variance[0] - 1) / steps == 0.25 |
|
|
|
|
|
|
|
# Note: variance is initialized to the variance of the initial trajectory + EPSILON
|
|
|
# (to avoid divide by 0) and multiplied by the number of steps. The correct answer is 0.25 |
|
|
|
assert variance[0] / steps == pytest.approx(0.25, abs=0.01) |
|
|
|
# Make another update, this time with all 1's |
|
|
|
time_horizon = 10 |
|
|
|
trajectory = make_fake_trajectory( |
|
|
|
|
|
|
|
|
|
|
assert steps == 16 |
|
|
|
assert mean[0] == 0.8125 |
|
|
|
assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01) |
|
|
|
assert variance[0] / steps == pytest.approx(0.152, abs=0.01) |
|
|
|
|
|
|
|
|
|
|
|
def test_min_visual_size(): |
|
|
|