
torch reward providers all pass

/develop/action-spec-gym
Andrew Cohen, 4 years ago
Current commit
24fd9b3c
9 files changed, 385 insertions and 370 deletions
  1. ml-agents/mlagents/trainers/demo_loader.py (9 changes)
  2. ml-agents/mlagents/trainers/tests/simple_test_envs.py (14 changes)
  3. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (2 changes)
  4. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (2 changes)
  5. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (684 changes)
  6. ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changes)
  7. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (12 changes)
  8. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (6 changes)
  9. ml-agents/mlagents/trainers/torch/utils.py (6 changes)

ml-agents/mlagents/trainers/demo_loader.py (9 changes)


        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
-       demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
+       if behavior_spec.action_spec.is_continuous():
+           demo_raw_buffer["continuous_action"].append(
+               current_pair_info.action_info.vector_actions
+           )
+       else:
+           demo_raw_buffer["discrete_action"].append(
+               current_pair_info.action_info.vector_actions
+           )
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            demo_raw_buffer.resequence_and_append(
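
The hunk above splits the single "actions" demo-buffer key into "continuous_action" and "discrete_action", chosen from the behavior's action spec. A minimal sketch of that routing, assuming an ActionSpec-like object with an is_continuous() method; the helper below is purely illustrative and not part of the change:

# Illustrative only: mirrors the if/else added in demo_loader.py above.
def buffer_key_for(action_spec) -> str:
    # Continuous specs write to "continuous_action", everything else to "discrete_action".
    return "continuous_action" if action_spec.is_continuous() else "discrete_action"

# e.g. demo_raw_buffer[buffer_key_for(behavior_spec.action_spec)].append(vector_actions)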

ml-agents/mlagents/trainers/tests/simple_test_envs.py (14 changes)


    def step(self) -> None:
        super().step()
        for name in self.names:
+           if self.discrete:
+               action = self.action[name].discrete
+           else:
+               action = self.action[name].continuous
-               self.step_result[name][0], self.step_result[name][1], self.action[name]
+               self.step_result[name][0], self.step_result[name][1], action
            )
            self.demonstration_protos[name] = self.demonstration_protos[name][
                -self.n_demos :

        for _ in range(self.n_demos):
            for name in self.names:
                if self.discrete:
-                   self.action[name] = [[1]] if self.goal[name] > 0 else [[0]]
+                   self.action[name] = ActionBuffers(
+                       [[]], np.array([[1]] if self.goal[name] > 0 else [[0]])
+                   )
                else:
-                   self.action[name] = [[float(self.goal[name])]]
+                   self.action[name] = ActionBuffers(
+                       np.array([[float(self.goal[name])]]), [[]]
+                   )
                self.step()
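
The recording environment's solved actions are now wrapped in an ActionBuffers pair instead of a bare nested list. A self-contained sketch of the same construction, using a stand-in NamedTuple in place of the real ActionBuffers class and assuming the (continuous, discrete) argument order shown in the hunk:

import numpy as np
from typing import Any, NamedTuple

class ActionBuffersStandIn(NamedTuple):
    # Stand-in for the ActionBuffers container referenced in the diff above.
    continuous: Any
    discrete: Any

def solved_action(discrete: bool, goal: float) -> ActionBuffersStandIn:
    # Discrete branch: a single action index (1 if the goal is positive, else 0).
    if discrete:
        return ActionBuffersStandIn([[]], np.array([[1]] if goal > 0 else [[0]]))
    # Continuous branch: the goal value itself, with an empty discrete part.
    return ActionBuffersStandIn(np.array([[float(goal)]]), [[]])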

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (2 changes)


    for _ in range(200):
        curiosity_rp.update(buffer)
    prediction = curiosity_rp._network.predict_action(buffer)[0]
-   target = torch.tensor(buffer["actions"][0])
+   target = torch.tensor(buffer["continuous_action"][0])
    error = torch.mean((prediction - target) ** 2).item()
    assert error < 0.001

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (2 changes)


    buffer["vector_obs"].append(curr_split_obs.vector_observations)
    buffer["next_vector_in"].append(next_split_obs.vector_observations)
    for _act_type, _act in action.items():
-       buffer[_act_type].append(_act)
+       buffer[_act_type].append(_act[0, :])
    buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
    buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
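
The fake buffer used by the reward-provider tests now stores each action type under its own key and keeps only the first row of each per-step action array. A rough sketch of that loop with made-up data; the dict contents below are hypothetical:

import numpy as np
from collections import defaultdict

buffer = defaultdict(list)
# Hypothetical per-type action for a single step, shape (1, action_size).
action = {"continuous_action": np.zeros((1, 2), dtype=np.float32)}
for _act_type, _act in action.items():
    buffer[_act_type].append(_act[0, :])  # first row only, shape (2,)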

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (684 changes)


check_environment_trains(env, {BRAIN_NAME: config})
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_2d_ppo(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
# )
# new_hyperparams = attr.evolve(
# PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
# )
# config = attr.evolve(
# PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
# )
# check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)
config = attr.evolve(
PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_ppo(num_visual, use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=0,
step_size=0.2,
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_ppo(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(
PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=900,
summary_freq=100,
)
# The number of steps is pretty small for these encoders
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_ppo(use_discrete):
env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
new_network_settings = attr.evolve(
PPO_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters,
learning_rate=1.0e-3,
batch_size=64,
buffer_size=128,
)
config = attr.evolve(
PPO_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=5000,
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
# @pytest.mark.parametrize("use_discrete", [True, False])
# @pytest.mark.parametrize("num_visual", [1, 2])
# def test_visual_ppo(num_visual, use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=use_discrete,
# num_visual=num_visual,
# num_vector=0,
# step_size=0.2,
# )
# new_hyperparams = attr.evolve(
# PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
# )
# config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
# check_environment_trains(env, {BRAIN_NAME: config})
#
#
# @pytest.mark.parametrize("num_visual", [1, 2])
# @pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
# def test_visual_advanced_ppo(vis_encode_type, num_visual):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=True,
# num_visual=num_visual,
# num_vector=0,
# step_size=0.5,
# vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
# )
# new_networksettings = attr.evolve(
# SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
# )
# new_hyperparams = attr.evolve(
# PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
# )
# config = attr.evolve(
# PPO_TORCH_CONFIG,
# hyperparameters=new_hyperparams,
# network_settings=new_networksettings,
# max_steps=900,
# summary_freq=100,
# )
# # The number of steps is pretty small for these encoders
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_recurrent_ppo(use_discrete):
# env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
# new_network_settings = attr.evolve(
# PPO_TORCH_CONFIG.network_settings,
# memory=NetworkSettings.MemorySettings(memory_size=16),
# )
# new_hyperparams = attr.evolve(
# PPO_TORCH_CONFIG.hyperparameters,
# learning_rate=1.0e-3,
# batch_size=64,
# buffer_size=128,
# )
# config = attr.evolve(
# PPO_TORCH_CONFIG,
# hyperparameters=new_hyperparams,
# network_settings=new_network_settings,
# max_steps=5000,
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
#
#
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)

# @pytest.mark.parametrize("use_discrete", [True])
# def test_2d_sac(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
# )
# new_hyperparams = attr.evolve(
# SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
# )
# config = attr.evolve(
# SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
@pytest.mark.parametrize("use_discrete", [True])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
)
config = attr.evolve(
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
# @pytest.mark.parametrize("use_discrete", [True, False])
# @pytest.mark.parametrize("num_visual", [1, 2])
# def test_visual_sac(num_visual, use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=use_discrete,
# num_visual=num_visual,
# num_vector=0,
# step_size=0.2,
# )
# new_hyperparams = attr.evolve(
# SAC_TORCH_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
# )
# config = attr.evolve(SAC_TORCH_CONFIG, hyperparameters=new_hyperparams)
# check_environment_trains(env, {BRAIN_NAME: config})
#
#
# @pytest.mark.parametrize("num_visual", [1, 2])
# @pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
# def test_visual_advanced_sac(vis_encode_type, num_visual):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=True,
# num_visual=num_visual,
# num_vector=0,
# step_size=0.5,
# vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
# )
# new_networksettings = attr.evolve(
# SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
# )
# new_hyperparams = attr.evolve(
# SAC_TORCH_CONFIG.hyperparameters,
# batch_size=16,
# learning_rate=3e-4,
# buffer_init_steps=0,
# )
# config = attr.evolve(
# SAC_TORCH_CONFIG,
# hyperparameters=new_hyperparams,
# network_settings=new_networksettings,
# max_steps=100,
# )
# # The number of steps is pretty small for these encoders
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_recurrent_sac(use_discrete):
# step_size = 0.2 if use_discrete else 0.5
# env = MemoryEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
# )
# new_networksettings = attr.evolve(
# SAC_TORCH_CONFIG.network_settings,
# memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
# )
# new_hyperparams = attr.evolve(
# SAC_TORCH_CONFIG.hyperparameters,
# batch_size=256,
# learning_rate=1e-3,
# buffer_init_steps=1000,
# steps_per_update=2,
# )
# config = attr.evolve(
# SAC_TORCH_CONFIG,
# hyperparameters=new_hyperparams,
# network_settings=new_networksettings,
# max_steps=2000,
# )
# check_environment_trains(env, {BRAIN_NAME: config})
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_ghost(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
# )
# self_play_settings = SelfPlaySettings(
# play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
# )
# config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
# check_environment_trains(env, {BRAIN_NAME: config})
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_ghost_fails(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
# )
# # This config should fail because the ghosted policy is never swapped with a competent policy.
# # Swap occurs after max step is reached.
# self_play_settings = SelfPlaySettings(
# play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
# )
# config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
# processed_rewards = [
# default_reward_processor(rewards) for rewards in env.final_rewards.values()
# ]
# success_threshold = 0.9
# assert any(reward > success_threshold for reward in processed_rewards) and any(
# reward < success_threshold for reward in processed_rewards
# )
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_asymm_ghost(use_discrete):
# # Make opponent for asymmetric case
# brain_name_opp = BRAIN_NAME + "Opp"
# env = SimpleEnvironment(
# [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
# )
# self_play_settings = SelfPlaySettings(
# play_against_latest_model_ratio=1.0,
# save_steps=10000,
# swap_steps=10000,
# team_change=400,
# )
# config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=4000)
# check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_asymm_ghost_fails(use_discrete):
# # Make opponent for asymmetric case
# brain_name_opp = BRAIN_NAME + "Opp"
# env = SimpleEnvironment(
# [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
# )
# # This config should fail because the team that is not learning when both have reached
# # max step should be executing the initial, untrained policy.
# self_play_settings = SelfPlaySettings(
# play_against_latest_model_ratio=0.0,
# save_steps=5000,
# swap_steps=5000,
# team_change=2000,
# )
# config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=3000)
# check_environment_trains(
# env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
# )
# processed_rewards = [
# default_reward_processor(rewards) for rewards in env.final_rewards.values()
# ]
# success_threshold = 0.9
# assert any(reward > success_threshold for reward in processed_rewards) and any(
# reward < success_threshold for reward in processed_rewards
# )
#
#
# @pytest.fixture(scope="session")
# def simple_record(tmpdir_factory):
# def record_demo(use_discrete, num_visual=0, num_vector=1):
# env = RecordEnvironment(
# [BRAIN_NAME],
# use_discrete=use_discrete,
# num_visual=num_visual,
# num_vector=num_vector,
# n_demos=100,
# )
# # If we want to use true demos, we can solve the env in the usual way
# # Otherwise, we can just call solve to execute the optimal policy
# env.solve()
# agent_info_protos = env.demonstration_protos[BRAIN_NAME]
# meta_data_proto = DemonstrationMetaProto()
# brain_param_proto = BrainParametersProto(
# vector_action_size=[2] if use_discrete else [1],
# vector_action_descriptions=[""],
# vector_action_space_type=discrete if use_discrete else continuous,
# brain_name=BRAIN_NAME,
# is_training=True,
# )
# action_type = "Discrete" if use_discrete else "Continuous"
# demo_path_name = "1DTest" + action_type + ".demo"
# demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
# write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
# return demo_path
#
# return record_demo
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# @pytest.mark.parametrize("trainer_config", [PPO_TORCH_CONFIG, SAC_TORCH_CONFIG])
# def test_gail(simple_record, use_discrete, trainer_config):
# demo_path = simple_record(use_discrete)
# env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
# bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
# reward_signals = {
# RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
# }
# config = attr.evolve(
# trainer_config,
# reward_signals=reward_signals,
# behavioral_cloning=bc_settings,
# max_steps=500,
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_gail_visual_ppo(simple_record, use_discrete):
# demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
# env = SimpleEnvironment(
# [BRAIN_NAME],
# num_visual=1,
# num_vector=0,
# use_discrete=use_discrete,
# step_size=0.2,
# )
# bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
# reward_signals = {
# RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
# }
# hyperparams = attr.evolve(PPO_TORCH_CONFIG.hyperparameters, learning_rate=3e-4)
# config = attr.evolve(
# PPO_TORCH_CONFIG,
# reward_signals=reward_signals,
# hyperparameters=hyperparams,
# behavioral_cloning=bc_settings,
# max_steps=1000,
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
#
#
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_gail_visual_sac(simple_record, use_discrete):
# demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
# env = SimpleEnvironment(
# [BRAIN_NAME],
# num_visual=1,
# num_vector=0,
# use_discrete=use_discrete,
# step_size=0.2,
# )
# bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
# reward_signals = {
# RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
# }
# hyperparams = attr.evolve(
# SAC_TORCH_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
# )
# config = attr.evolve(
# SAC_TORCH_CONFIG,
# reward_signals=reward_signals,
# hyperparameters=hyperparams,
# behavioral_cloning=bc_settings,
# max_steps=500,
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_sac(num_visual, use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=0,
step_size=0.2,
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
)
config = attr.evolve(SAC_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("num_visual", [1, 2])
@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
def test_visual_advanced_sac(vis_encode_type, num_visual):
env = SimpleEnvironment(
[BRAIN_NAME],
use_discrete=True,
num_visual=num_visual,
num_vector=0,
step_size=0.5,
vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
)
new_networksettings = attr.evolve(
SAC_TORCH_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,
batch_size=16,
learning_rate=3e-4,
buffer_init_steps=0,
)
config = attr.evolve(
SAC_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=100,
)
# The number of steps is pretty small for these encoders
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_recurrent_sac(use_discrete):
step_size = 0.2 if use_discrete else 0.5
env = MemoryEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
)
new_networksettings = attr.evolve(
SAC_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters,
batch_size=256,
learning_rate=1e-3,
buffer_init_steps=1000,
steps_per_update=2,
)
config = attr.evolve(
SAC_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_networksettings,
max_steps=2000,
)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
)
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ghost_fails(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the ghosted policy is never swapped with a competent policy.
# Swap occurs after max step is reached.
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
)
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=1.0,
save_steps=10000,
swap_steps=10000,
team_change=400,
)
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=4000)
check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_asymm_ghost_fails(use_discrete):
# Make opponent for asymmetric case
brain_name_opp = BRAIN_NAME + "Opp"
env = SimpleEnvironment(
[BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], use_discrete=use_discrete
)
# This config should fail because the team that is not learning when both have reached
# max step should be executing the initial, untrained policy.
self_play_settings = SelfPlaySettings(
play_against_latest_model_ratio=0.0,
save_steps=5000,
swap_steps=5000,
team_change=2000,
)
config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=3000)
check_environment_trains(
env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
)
processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
@pytest.fixture(scope="session")
def simple_record(tmpdir_factory):
def record_demo(use_discrete, num_visual=0, num_vector=1):
env = RecordEnvironment(
[BRAIN_NAME],
use_discrete=use_discrete,
num_visual=num_visual,
num_vector=num_vector,
n_demos=100,
)
# If we want to use true demos, we can solve the env in the usual way
# Otherwise, we can just call solve to execute the optimal policy
env.solve()
agent_info_protos = env.demonstration_protos[BRAIN_NAME]
meta_data_proto = DemonstrationMetaProto()
brain_param_proto = BrainParametersProto(
vector_action_size=[2] if use_discrete else [1],
vector_action_descriptions=[""],
vector_action_space_type=discrete if use_discrete else continuous,
brain_name=BRAIN_NAME,
is_training=True,
)
action_type = "Discrete" if use_discrete else "Continuous"
demo_path_name = "1DTest" + action_type + ".demo"
demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
return demo_path
return record_demo
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("trainer_config", [PPO_TORCH_CONFIG, SAC_TORCH_CONFIG])
def test_gail(simple_record, use_discrete, trainer_config):
demo_path = simple_record(use_discrete)
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
config = attr.evolve(
trainer_config,
reward_signals=reward_signals,
behavioral_cloning=bc_settings,
max_steps=500,
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_ppo(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,
use_discrete=use_discrete,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(PPO_TORCH_CONFIG.hyperparameters, learning_rate=3e-4)
config = attr.evolve(
PPO_TORCH_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=1000,
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_gail_visual_sac(simple_record, use_discrete):
demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
env = SimpleEnvironment(
[BRAIN_NAME],
num_visual=1,
num_vector=0,
use_discrete=use_discrete,
step_size=0.2,
)
bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
reward_signals = {
RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
}
hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
)
config = attr.evolve(
SAC_TORCH_CONFIG,
reward_signals=reward_signals,
hyperparameters=hyperparams,
behavioral_cloning=bc_settings,
max_steps=500,
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
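
Every re-enabled test above follows the same pattern: take the shared PPO_TORCH_CONFIG or SAC_TORCH_CONFIG defaults and override a few fields with attr.evolve. A minimal illustration of attr.evolve on a frozen attrs class; the class below is a made-up example, not the real settings object:

import attr

@attr.s(auto_attribs=True, frozen=True)
class ExampleHyperparameters:
    batch_size: int = 1024
    buffer_size: int = 10240
    learning_rate: float = 3.0e-4

base = ExampleHyperparameters()
small = attr.evolve(base, batch_size=64, buffer_size=640)
# evolve returns a modified copy; the shared default is untouched.
assert base.batch_size == 1024 and small.batch_size == 64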

ml-agents/mlagents/trainers/torch/components/bc/module.py (20 changes)


        update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)}
        return update_stats

-   def _behavioral_cloning_loss(self, selected_actions: AgentAction, log_probs: ActionLogProbs, expert_actions: torch.Tensor):
+   def _behavioral_cloning_loss(
+       self,
+       selected_actions: AgentAction,
+       log_probs: ActionLogProbs,
+       expert_actions: torch.Tensor,
+   ):
-       bc_loss = torch.nn.functional.mse_loss(selected_actions.continuous_tensor, expert_actions)
+       bc_loss = torch.nn.functional.mse_loss(
+           selected_actions.continuous_tensor, expert_actions
+       )
-           log_probs.all_discrete_tensor, self.policy.behavior_spec.action_spec.discrete_branches
+           log_probs.all_discrete_tensor,
+           self.policy.behavior_spec.action_spec.discrete_branches,
        )
        bc_loss = torch.mean(
            torch.stack(

        vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
        act_masks = None
        if self.policy.use_continuous_act:
-           expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])
+           expert_actions = ModelUtils.list_to_tensor(
+               mini_batch_demo["continuous_action"]
+           )
-               mini_batch_demo["actions"], dtype=torch.long
+               mini_batch_demo["discrete_action"], dtype=torch.long
            )
            expert_actions = ModelUtils.actions_to_onehot(
                raw_expert_actions, self.policy.act_size
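
For continuous actions the behavioral-cloning loss above is a mean-squared error between the policy's selected actions and the expert actions read from the "continuous_action" demo key; discrete actions are compared branch-by-branch against one-hot expert actions instead. A hedged sketch of just the continuous branch using plain PyTorch; the function name is illustrative:

import torch
import torch.nn.functional as F

def continuous_bc_loss(selected_continuous: torch.Tensor, expert_actions: torch.Tensor) -> torch.Tensor:
    # Mean-squared error between policy and expert continuous actions, as in the hunk above.
    return F.mse_loss(selected_continuous, expert_actions)

# Example with (batch, action_size) tensors.
loss = continuous_bc_loss(torch.zeros(4, 2), torch.ones(4, 2))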

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (12 changes)


        else:
            action = torch.cat(
                ModelUtils.actions_to_onehot(
-                   actions.discrete_tensor,
-                   self._action_spec.discrete_branches,
+                   actions.discrete_tensor, self._action_spec.discrete_branches
        print(self.get_current_state(mini_batch), action)
        forward_model_input = torch.cat(
            (self.get_current_state(mini_batch), action), dim=1
        )

        predicted_action = self.predict_action(mini_batch)
        actions = AgentAction.extract(mini_batch)
        if self._action_spec.is_continuous():
-           sq_difference = (
-               actions.continuous_tensor
-               - predicted_action
-           ) ** 2
+           sq_difference = (actions.continuous_tensor - predicted_action) ** 2
            sq_difference = torch.sum(sq_difference, dim=1)
            return torch.mean(
                ModelUtils.dynamic_partition(

        else:
            true_action = torch.cat(
                ModelUtils.actions_to_onehot(
-                   actions.discrete_tensor,
-                   self._action_spec.discrete_branches,
+                   actions.discrete_tensor, self._action_spec.discrete_branches
                ),
                dim=1,
            )
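
In the discrete case the curiosity forward model is fed the per-branch one-hot encoding of the taken action, concatenated along the feature axis with the encoded current state. A standalone sketch of that encoding step using torch.nn.functional.one_hot; the branch sizes below are made up:

import torch
import torch.nn.functional as F

def onehot_concat(discrete_actions: torch.Tensor, branch_sizes) -> torch.Tensor:
    # discrete_actions: (batch, num_branches) integer tensor, one column per branch.
    onehots = [
        F.one_hot(discrete_actions[:, i].long(), num_classes=size).float()
        for i, size in enumerate(branch_sizes)
    ]
    return torch.cat(onehots, dim=1)

acts = torch.tensor([[1, 0], [2, 1]])
print(onehot_concat(acts, [3, 2]).shape)  # torch.Size([2, 5])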

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (6 changes)


)
from mlagents.trainers.settings import GAILSettings
from mlagents_envs.base_env import BehaviorSpec
- from mlagents.trainers.torch.utils import ModelUtils
+ from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import linear_layer, Initialization
from mlagents.trainers.settings import NetworkSettings, EncoderType

        Creates the action Tensor. In the continuous case, corresponds to the action. In
        the discrete case, corresponds to the concatenation of one hot action Tensors.
        """
-       return self._action_flattener.forward(
-           torch.as_tensor(mini_batch["actions"], dtype=torch.float)
-       )
+       return self._action_flattener.forward(AgentAction.extract(mini_batch))

    def get_state_inputs(
        self, mini_batch: AgentBuffer

ml-agents/mlagents/trainers/torch/utils.py (6 changes)


        else:
            return sum(self._specs.discrete_branches)

-   def forward(self, action: torch.Tensor) -> torch.Tensor:
+   def forward(self, action: AgentAction) -> torch.Tensor:
-           return action
+           return action.continuous_tensor
-               torch.as_tensor(action, dtype=torch.long),
+               torch.as_tensor(action.discrete_tensor, dtype=torch.long),
                self._specs.discrete_branches,
            ),
            dim=1,
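
With this change the flattener's forward takes an AgentAction rather than a raw tensor: continuous actions pass through unchanged, while discrete actions are one-hot encoded per branch and concatenated. The flattened width is therefore either the continuous size or the sum of the discrete branch sizes; a tiny sketch of that calculation with illustrative names:

def flattened_action_size(continuous_size: int, discrete_branches) -> int:
    # Continuous specs keep their width; each discrete branch expands to one column per value.
    if continuous_size > 0:
        return continuous_size
    return sum(discrete_branches)

assert flattened_action_size(3, ()) == 3
assert flattened_action_size(0, (3, 2)) == 5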
