DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 32
NUM_AGENTS = 12
EPSILON = 1e-7


def create_policy_mock(
    assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
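

# A minimal NumPy sketch of the running-statistics update the tests below
# exercise: a Chan et al.-style parallel update that tracks a step count, a
# running mean, and an accumulated sum of squared deviations. This is an
# assumption about the normalizer's math, not the actual TFPolicy
# implementation; `_reference_running_stats` is a hypothetical helper that
# exists only for illustration in this file.
def _reference_running_stats(batches):
    steps, mean, m2 = 0, 0.0, 0.0  # m2 = running sum of squared deviations
    for batch in batches:
        batch = np.asarray(batch, dtype=np.float64)
        n = batch.size
        delta = batch.mean() - mean
        total = steps + n
        # Merge the batch stats into the running stats (parallel variance update)
        m2 += batch.var() * n + delta ** 2 * steps * n / total
        mean += delta * n / total
        steps = total
    # Dividing the accumulated m2 by the step count recovers the variance,
    # mirroring the `variance[0] / steps` checks in the tests below.
    return mean, m2 / steps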
def test_large_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713, which causes NaN without proper initialization
    # (see the float32 cancellation sketch after these observation lists).
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
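    # Why these magnitudes matter (illustrative addition, not part of the
    # original test): in float32, the naive variance formula E[x^2] - E[x]^2
    # catastrophically cancels for large, nearly constant values like these,
    # while NumPy's two-pass np.var stays accurate.
    naive_var = np.mean(np.square(large_obs1), dtype=np.float32) - np.square(
        np.mean(large_obs1, dtype=np.float32)
    )  # dominated by rounding error near 3.24e6; can even come out negative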
    policy = TFPolicy(
        0,  # seed
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",  # model path
        False,  # load
    )
    time_horizon = len(large_obs1)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])
    # Check that the running mean and variance are correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )
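    # The division by `steps` reflects an assumption about the implementation:
    # running_variance appears to accumulate the sum of squared deviations (the
    # m2 term in the sketch near the top of this file) rather than the variance
    # itself, so variance[0] / steps is what should match np.var.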
    time_horizon = len(large_obs2)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
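    # Cross-check against the hypothetical _reference_running_stats sketch
    # defined above (illustrative addition, not part of the original test):
    # merging both streams via the parallel update should land on the same
    # combined statistics the policy's normalizer reports.
    ref_mean, ref_var = _reference_running_stats([large_obs1, large_obs2])
    assert ref_mean == pytest.approx(mean[0], abs=0.02)
    assert ref_var == pytest.approx(variance[0] / steps, abs=0.02)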


def test_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is initialized to the variance of the initial trajectory
    # + EPSILON (to avoid divide by 0) and multiplied by the number of steps.
    # The correct answer is 0.25.
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
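    # Worked arithmetic for the final checks (illustrative): the stream now
    # holds 13 ones and 3 zeros, so mean = 13/16 = 0.8125 and variance =
    # (13/16) * (3/16) = 39/256 ~= 0.1523. The hypothetical sketch near the top
    # agrees: _reference_running_stats([[0, 0, 0, 1, 1, 1], [1] * 10])
    # returns (0.8125, 0.15234375).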


def test_min_visual_size():