Compare commits

...
This merge request has changes that conflict with the target branch.
/ml-agents/mlagents/trainers/trainer_controller.py
/ml-agents/mlagents/trainers/ghost/trainer.py
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/sac/trainer.py
/ml-agents/mlagents/trainers/trainer/trainer.py
/ml-agents/mlagents/trainers/tests/test_ppo.py
/ml-agents/mlagents/trainers/tests/test_sac.py
/ml-agents/mlagents/trainers/tests/test_ghost.py
/ml-agents/mlagents/trainers/tests/test_simple_rl.py

4 commits

9 files changed, with 70 insertions and 82 deletions
  1. ml-agents/mlagents/trainers/trainer_controller.py (6 changes)
  2. ml-agents/mlagents/trainers/ppo/trainer.py (10 changes)
  3. ml-agents/mlagents/trainers/sac/trainer.py (10 changes)
  4. ml-agents/mlagents/trainers/trainer/trainer.py (2 changes)
  5. ml-agents/mlagents/trainers/ghost/trainer.py (35 changes)
  6. ml-agents/mlagents/trainers/tests/test_ppo.py (47 changes)
  7. ml-agents/mlagents/trainers/tests/test_sac.py (28 changes)
  8. ml-agents/mlagents/trainers/tests/test_ghost.py (12 changes)
  9. ml-agents/mlagents/trainers/tests/test_simple_rl.py (2 changes)

ml-agents/mlagents/trainers/trainer_controller.py (6 changes)


trainer = self.trainer_factory.generate(brain_name)
self.trainers[brain_name] = trainer
policy = trainer.create_policy(env_manager.external_brains[name_behavior_id])
trainer.add_policy(parsed_behavior_id, policy)
trainer.add_policy(
parsed_behavior_id, env_manager.external_brains[name_behavior_id]
)
policy = trainer.get_policy(name_behavior_id)
agent_manager = AgentManager(
policy,

ml-agents/mlagents/trainers/ppo/trainer.py (10 changes)


self._check_param_keys()
self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: TFPolicy = None # type: ignore
def _check_param_keys(self):
super()._check_param_keys()

return policy
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
self.policy = self.create_policy(brain_parameters)
self.step = policy.get_current_step()
self.step = self.policy.get_current_step()
self.next_summary_step = self._get_next_summary_step()
def get_policy(self, name_behavior_id: str) -> TFPolicy:

ml-agents/mlagents/trainers/sac/trainer.py (10 changes)


self._check_param_keys()
self.load = load
self.seed = seed
self.policy: NNPolicy = None # type: ignore
self.policy: TFPolicy = None # type: ignore
self.optimizer: SACOptimizer = None # type: ignore
self.step = 0

self._stats_reporter.add_stat(stat, np.mean(stat_list))
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> None:
"""
Adds policy to trainer.

self.__class__.__name__
)
)
if not isinstance(policy, NNPolicy):
raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
self.policy = policy
self.policy = self.create_policy(brain_parameters)
self.step = policy.get_current_step()
self.step = self.policy.get_current_step()
self.next_summary_step = self._get_next_summary_step()
def get_policy(self, name_behavior_id: str) -> TFPolicy:

ml-agents/mlagents/trainers/trainer/trainer.py (2 changes)


@abc.abstractmethod
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> None:
"""
Adds policy to trainer.
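
The abstract signature change above (and the matching hunks in trainer_controller.py, ppo/trainer.py and sac/trainer.py) moves policy construction from the caller into the trainer: add_policy() now receives BrainParameters instead of a ready-made policy. Below is a self-contained sketch of the new contract using hypothetical stand-in classes; BrainParameters, Policy and Trainer here are illustrative only, not the real mlagents types.

from dataclasses import dataclass


@dataclass
class BrainParameters:
    """Stand-in for mlagents' BrainParameters (illustrative only)."""
    brain_name: str


class Policy:
    """Stand-in for TFPolicy / NNPolicy (illustrative only)."""

    def __init__(self, brain_parameters):
        self.brain_parameters = brain_parameters

    def get_current_step(self):
        # A freshly created policy starts at step 0.
        return 0


class Trainer:
    """Stand-in concrete trainer implementing the new contract."""

    def create_policy(self, brain_parameters):
        return Policy(brain_parameters)

    def add_policy(self, behavior_id, brain_parameters):
        # New contract: the trainer builds and owns its policy and resumes the
        # step counter from it, instead of receiving a ready-made policy.
        self.policy = self.create_policy(brain_parameters)
        self.step = self.policy.get_current_step()

    def get_policy(self, behavior_id):
        return self.policy


# Caller side (mirrors the trainer_controller.py hunk): pass BrainParameters,
# then fetch the policy back from the trainer when it is needed.
brain_params = BrainParameters(brain_name="test_brain")
trainer = Trainer()
trainer.add_policy("test_brain?team=0", brain_params)
policy = trainer.get_policy("test_brain?team=0")
assert policy.brain_parameters is brain_params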

ml-agents/mlagents/trainers/ghost/trainer.py (35 changes)


def save_model(self, name_behavior_id: str) -> None:
"""
Forwarding call to wrapped trainers save_model
Loads the latest policy weights, saves it, then reloads
the current policy weights before resuming training.
policy = self.trainer.get_policy(brain_name)
reload_weights = policy.get_weights()
# save current snapshot to policy
policy.load_weights(self.current_policy_snapshot[brain_name])
self.trainer.save_model(name_behavior_id)
# reload
policy.load_weights(reload_weights)
self.trainer.save_model(brain_name)
First loads the latest snapshot.
policy = self.trainer.get_policy(brain_name)
policy.load_weights(self.current_policy_snapshot[brain_name])
self.trainer.export_model(brain_name)
def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:

return self.trainer.create_policy(brain_parameters)
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> None:
"""
Adds policy to trainer. The first policy encountered sets the wrapped

name_behavior_id = parsed_behavior_id.behavior_id
team_id = parsed_behavior_id.team_id
self.controller.subscribe_team_id(team_id, self)
policy = self.create_policy(brain_parameters)
policy.create_tf_graph()
policy.init_load_weights()
policy.create_tf_graph()
policy.init_load_weights()
# creates an internal trainer policy. This always contains the current learning policy
# parameterization and is the object the wrapped trainer uses to compute gradients.
self.trainer.add_policy(parsed_behavior_id, brain_parameters)
internal_trainer_policy = self.trainer.get_policy(
parsed_behavior_id.brain_name
)
internal_trainer_policy.create_tf_graph()
internal_trainer_policy.init_load_weights()
] = policy.get_weights()
] = internal_trainer_policy.get_weights()
# initialize ghost level policy to have the same weights
policy.load_weights(internal_trainer_policy.get_weights())
self.trainer.add_policy(parsed_behavior_id, policy)
self._learning_team = self.controller.get_learning_team
self.wrapped_trainer_team = team_id
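
Read together with the comments in the hunk above, the ghost trainer now builds its own inference policy, forwards the BrainParameters to the wrapped trainer, and seeds both the weight snapshot and the ghost policy from the wrapped trainer's internal learning policy. A minimal sketch of that sequence follows, with hypothetical stand-in classes; Policy, WrappedTrainer and GhostTrainer below are illustrative, not the mlagents implementation.

class Policy:
    """Stand-in policy with copyable weights (illustrative only)."""

    def __init__(self, weights=None):
        self._weights = list(weights or [0.0])

    def get_weights(self):
        return list(self._weights)

    def load_weights(self, weights):
        self._weights = list(weights)


class WrappedTrainer:
    """Stand-in for the wrapped PPO/SAC trainer (illustrative only)."""

    def create_policy(self, brain_parameters):
        return Policy()

    def add_policy(self, brain_name, brain_parameters):
        # The learning policy now lives inside the wrapped trainer.
        self.policy = Policy(weights=[1.0, 2.0])

    def get_policy(self, brain_name):
        return self.policy


class GhostTrainer:
    """Stand-in ghost trainer (illustrative only)."""

    def __init__(self, trainer):
        self.trainer = trainer
        self.current_policy_snapshot = {}

    def create_policy(self, brain_parameters):
        return self.trainer.create_policy(brain_parameters)

    def add_policy(self, brain_name, brain_parameters):
        policy = self.create_policy(brain_parameters)           # ghost-side policy
        self.trainer.add_policy(brain_name, brain_parameters)   # learning policy is built by the wrapped trainer
        internal = self.trainer.get_policy(brain_name)
        # Snapshot the learning policy's weights and start the ghost policy from them.
        self.current_policy_snapshot[brain_name] = internal.get_weights()
        policy.load_weights(internal.get_weights())
        return policy


ghost = GhostTrainer(WrappedTrainer())
ghost_policy = ghost.add_policy("brain", brain_parameters=None)
assert ghost_policy.get_weights() == ghost.current_policy_snapshot["brain"]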

ml-agents/mlagents/trainers/tests/test_ppo.py (47 changes)


)
@mock.patch("mlagents.trainers.ppo.trainer.NNPolicy")
def test_trainer_increment_step(ppo_optimizer, dummy_config):
def test_trainer_increment_step(ppo_optimizer, nn_policy, dummy_config):
mock_policy = mock.Mock()
mock_policy.get_current_step = mock.Mock(return_value=0)
step_count = (
5
) # 10 hacked because this function is no longer called through trainer
mock_policy.increment_step = mock.Mock(return_value=step_count)
nn_policy.return_value = mock_policy
brain_params = BrainParameters(
brain_name="test_brain",

trainer = PPOTrainer(
brain_params.brain_name, 0, trainer_params, True, False, 0, "0"
)
policy_mock = mock.Mock(spec=NNPolicy)
policy_mock.get_current_step.return_value = 0
step_count = (
5
) # 10 hacked because this function is no longer called through trainer
policy_mock.increment_step = mock.Mock(return_value=step_count)
trainer.add_policy("testbehavior", policy_mock)
trainer.add_policy("testbehavior", brain_params)
policy = trainer.get_policy("testbehavior")
policy_mock.increment_step.assert_called_with(5)
policy.increment_step.assert_called_with(5)
@pytest.mark.parametrize("use_discrete", [True, False])

trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128
trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
policy = trainer.create_policy(mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
trainer.add_policy(mock_brain.brain_name, mock_brain)
# Test update with sequence length smaller than batch size
buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
# Mock out reward signal eval

dummy_config["summary_path"] = "./summaries/test_trainer_summary"
dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer.add_policy(brain_params.brain_name, brain_params)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)
time_horizon = 15

assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
@mock.patch("mlagents.trainers.ppo.trainer.NNPolicy")
def test_add_get_policy(ppo_optimizer, dummy_config):
def test_add_get_policy(ppo_optimizer, nn_policy, dummy_config):
brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)

mock_policy = mock.Mock()
mock_policy.get_current_step = mock.Mock(return_value=2000)
nn_policy.return_value = mock_policy
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000
trainer.add_policy(brain_params.brain_name, policy)
assert trainer.get_policy(brain_params.brain_name) == policy
trainer.add_policy(brain_params.brain_name, brain_params)
# Test incorrect class of policy
policy = mock.Mock()
with pytest.raises(RuntimeError):
trainer.add_policy(brain_params, policy)
def test_bad_config(dummy_config):
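
Because add_policy() now constructs the policy internally, the updated tests patch the NNPolicy class so that the object the trainer creates is a controllable mock. Below is a self-contained illustration of that patching pattern, using hypothetical stand-in classes rather than the repository's test fixtures.

from unittest import mock


class NNPolicy:
    """Stand-in for the policy class the trainer instantiates (illustrative only)."""

    def __init__(self, brain_parameters):
        self.brain_parameters = brain_parameters

    def get_current_step(self):
        return 0


class Trainer:
    """Stand-in trainer: add_policy() builds its own policy from brain parameters."""

    def add_policy(self, behavior_id, brain_parameters):
        self.policy = NNPolicy(brain_parameters)
        self.step = self.policy.get_current_step()

    def get_policy(self, behavior_id):
        return self.policy


def test_add_get_policy():
    # Patch the policy class so the internally created policy is a mock whose step
    # counter we control, mirroring the @mock.patch("...NNPolicy") decorators used
    # in test_ppo.py and test_sac.py above.
    with mock.patch(f"{__name__}.NNPolicy") as nn_policy:
        mock_policy = mock.Mock()
        mock_policy.get_current_step.return_value = 2000
        nn_policy.return_value = mock_policy

        trainer = Trainer()
        trainer.add_policy("testbehavior", brain_parameters=None)

        assert trainer.get_policy("testbehavior") is mock_policy
        assert trainer.step == 2000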

ml-agents/mlagents/trainers/tests/test_sac.py (28 changes)


trainer_params["model_path"] = str(tmpdir)
trainer_params["save_replay_buffer"] = True
trainer = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, False, 0, 0)
policy = trainer.create_policy(mock_brain)
trainer.add_policy(mock_brain.brain_name, policy)
trainer.add_policy(mock_brain.brain_name, mock_brain)
policy = trainer.get_policy(mock_brain.brain_name)
trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
buffer_len = trainer.update_buffer.num_experiences

trainer2 = SACTrainer(mock_brain.brain_name, 1, trainer_params, True, True, 0, 0)
policy = trainer2.create_policy(mock_brain)
trainer2.add_policy(mock_brain.brain_name, policy)
trainer2.add_policy(mock_brain.brain_name, mock_brain)
@mock.patch("mlagents.trainers.sac.trainer.NNPolicy")
def test_add_get_policy(sac_optimizer, dummy_config):
def test_add_get_policy(sac_optimizer, nn_policy, dummy_config):
brain_params = make_brain_parameters(
discrete_action=False, visual_inputs=0, vec_obs_size=6
)

mock_policy = mock.Mock()
mock_policy.get_current_step = mock.Mock(return_value=2000)
nn_policy.return_value = mock_policy
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = mock.Mock(spec=NNPolicy)
policy.get_current_step.return_value = 2000
trainer.add_policy(brain_params.brain_name, policy)
assert trainer.get_policy(brain_params.brain_name) == policy
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
trainer.add_policy(brain_params.brain_name, brain_params)
# Test incorrect class of policy
policy = mock.Mock()
with pytest.raises(RuntimeError):
trainer.add_policy(brain_params, policy)
def test_process_trajectory(dummy_config):
brain_params = make_brain_parameters(

dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
policy = trainer.create_policy(brain_params)
trainer.add_policy(brain_params.brain_name, policy)
trainer.add_policy(brain_params.brain_name, brain_params)
trajectory_queue = AgentManagerQueue("testbrain")
trainer.subscribe_trajectory_queue(trajectory_queue)

ml-agents/mlagents/trainers/tests/test_ghost.py (12 changes)


)
# first policy encountered becomes policy trained by wrapped PPO
policy = trainer.create_policy(brain_params_team0)
trainer.add_policy(parsed_behavior_id0, policy)
trainer.add_policy(parsed_behavior_id0, brain_params_team0)
policy = trainer.create_policy(brain_params_team1)
trainer.add_policy(parsed_behavior_id1, policy)
trainer.add_policy(parsed_behavior_id1, brain_params_team1)
trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.subscribe_trajectory_queue(trajectory_queue1)

# First policy encountered becomes policy trained by wrapped PPO
# This queue should remain empty after swap snapshot
policy = trainer.create_policy(brain_params_team0)
trainer.add_policy(parsed_behavior_id0, policy)
trainer.add_policy(parsed_behavior_id0, brain_params_team0)
policy = trainer.create_policy(brain_params_team1)
trainer.add_policy(parsed_behavior_id1, policy)
trainer.add_policy(parsed_behavior_id1, brain_params_team1)
policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
trainer.publish_policy_queue(policy_queue1)

ml-agents/mlagents/trainers/tests/test_simple_rl.py (2 changes)


processed_rewards = [
default_reward_processor(rewards) for rewards in env.final_rewards.values()
]
success_threshold = 0.99
success_threshold = 0.9
assert any(reward > success_threshold for reward in processed_rewards) and any(
reward < success_threshold for reward in processed_rewards
)
