
fix tensorflow test ppo

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit: 8c42dcc7
2 files changed, 56 insertions and 27 deletions
1. ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66 changed lines)
2. ml-agents/mlagents/trainers/tests/torch/test_ppo.py (17 changed lines)

ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py (66 changed lines)


        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
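Why the fake buffer is reshaped this way: the TF optimizer consumes one log probability per possible discrete action, concatenated across branches, while the continuous head stores one per action dimension. A minimal shape sketch, not part of the commit, with hypothetical branch sizes standing in for the ones produced by the test fixtures:

import numpy as np

n_agents = 12                   # rows in the fake update buffer (hypothetical)
discrete_branches = (3, 3, 2)   # hypothetical multi-discrete branches
continuous_size = 2             # hypothetical continuous action dimension

# TF keeps a log prob for every possible discrete action, so the buffer column
# must be sum(discrete_branches) wide.
discrete_log_probs = np.ones(
    (n_agents, int(sum(discrete_branches))), dtype=np.float32
)
assert discrete_log_probs.shape == (12, 8)

# The continuous head stores one log prob per action dimension.
continuous_log_probs = np.ones((n_agents, continuous_size), dtype=np.float32)
assert continuous_log_probs.shape == (12, 2)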

        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
        )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

        use_visual=False,
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]

    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                )
            )
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(
                    mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
                )
            )
    trainer.update_buffer = buffer
    trainer._update_policy()
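Unlike the optimizer-level tests above, this trainer-level test rebuilds the field entry by entry, because an AgentBuffer field behaves like a list with one element per stored experience rather than a single array. A rough stand-in, not the mlagents AgentBuffer and using hypothetical sizes, showing only the reset-then-append pattern the test relies on:

import numpy as np

class FieldStandIn:
    """Toy list-like buffer field: one entry per buffered experience."""

    def __init__(self):
        self._entries = []

    def reset_field(self):
        # Discard the rollout's original entries so they can be replaced
        # with correctly shaped fake log probs.
        self._entries = []

    def append(self, value):
        self._entries.append(value)

    def __len__(self):
        return len(self._entries)

field = FieldStandIn()
for _ in range(5):                      # pretend the rollout filled 5 entries
    field.append(np.zeros(3, dtype=np.float32))

n_agents = len(field)
field.reset_field()
for _ in range(n_agents):               # refill with one row of ones per experience
    field.append(np.ones(int(sum((3, 2))), dtype=np.float32))
assert len(field) == n_agents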

ml-agents/mlagents/trainers/tests/torch/test_ppo.py (17 changed lines)


    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    if discrete:
        update_buffer["discrete_log_probs"] = np.ones_like(
            update_buffer["discrete_action"]
        )
    else:
        update_buffer["continuous_log_probs"] = np.ones_like(
            update_buffer["continuous_action"]
        )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,

    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
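For the torch tests the shapes already line up: the fake discrete actions carry one column per branch, so np.ones_like yields log probs of shape (n_agents, num_branches), and the continuous case matches (n_agents, continuous_size). A small sketch with hypothetical sizes, not taken from the test fixtures:

import numpy as np

n_agents, num_branches, continuous_size = 12, 2, 3   # hypothetical sizes

discrete_action = np.zeros((n_agents, num_branches), dtype=np.float32)
continuous_action = np.zeros((n_agents, continuous_size), dtype=np.float32)

# One log prob per branch (PyTorch convention), not one per possible action.
assert np.ones_like(discrete_action).shape == (12, 2)
# One log prob per continuous action dimension.
assert np.ones_like(continuous_action).shape == (12, 3)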
