
Merge pull request #2912 from Unity-Technologies/develop-allbraininfo

Bubbled up indexing of AllBrainInfo to trainer controller from trainers
Branch: develop-newnormalization
Committed via GitHub · 5 years ago
Current commit: c0453ae1
9 files changed, with 96 insertions and 95 deletions
Changed files:
1. ml-agents/mlagents/trainers/bc/trainer.py (41 changes)
2. ml-agents/mlagents/trainers/ppo/trainer.py (27 changes)
3. ml-agents/mlagents/trainers/rl_trainer.py (13 changes)
4. ml-agents/mlagents/trainers/sac/trainer.py (23 changes)
5. ml-agents/mlagents/trainers/tests/test_bc.py (22 changes)
6. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (6 changes)
7. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (35 changes)
8. ml-agents/mlagents/trainers/trainer.py (16 changes)
9. ml-agents/mlagents/trainers/trainer_controller.py (8 changes)
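The gist of the change: trainer methods such as add_experiences and process_experiences used to take AllBrainInfo (a dictionary mapping brain names to BrainInfo) and index it with self.brain_name internally; after this merge the trainer controller performs that lookup once and each trainer receives only its own BrainInfo. The toy example below illustrates the interface change; the BrainInfo dataclass here is a stand-in with only the fields needed for the demo, not the real mlagents.envs.brain.BrainInfo.

# Toy illustration of the interface change, not ML-Agents source.
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class BrainInfo:  # stand-in with only the fields used below
    agents: List[str] = field(default_factory=list)
    rewards: List[float] = field(default_factory=list)

AllBrainInfo = Dict[str, BrainInfo]  # brain name -> BrainInfo

class OldStyleTrainer:
    brain_name = "Ball3DBrain"

    def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo) -> float:
        info = next_info[self.brain_name]  # lookup happened inside every trainer
        return sum(info.rewards)

class NewStyleTrainer:
    brain_name = "Ball3DBrain"

    def process_experiences(self, current_info: BrainInfo, next_info: BrainInfo) -> float:
        return sum(next_info.rewards)  # the controller already did the lookup

all_info: AllBrainInfo = {"Ball3DBrain": BrainInfo(agents=["agent-0"], rewards=[1.0])}
print(OldStyleTrainer().process_experiences(all_info, all_info))  # 1.0
print(NewStyleTrainer().process_experiences(all_info["Ball3DBrain"], all_info["Ball3DBrain"]))  # 1.0

The per-file diffs below are this same rename applied to the base Trainer class, the BC/PPO/SAC trainers, their tests, and the trainer controller.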

ml-agents/mlagents/trainers/bc/trainer.py (41 changes)


  import numpy as np
- from mlagents.envs.brain import AllBrainInfo
+ from mlagents.envs.brain import BrainInfo
  from mlagents.envs.action_info import ActionInfoOutputs
  from mlagents.trainers.bc.policy import BCPolicy
  from mlagents.trainers.buffer import Buffer

      def add_experiences(
          self,
-         curr_all_info: AllBrainInfo,
-         next_all_info: AllBrainInfo,
+         curr_info: BrainInfo,
+         next_info: BrainInfo,
-         :param curr_all_info: Current AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
-         :param next_all_info: Next AllBrainInfo (Dictionary of all current brains and corresponding BrainInfo).
+         :param curr_info: Current BrainInfo
+         :param next_info: Next BrainInfo
-         info_student = curr_all_info[self.brain_name]
-         next_info_student = next_all_info[self.brain_name]
-         for agent_id in info_student.agents:
-             self.evaluation_buffer[agent_id].last_brain_info = info_student
+         for agent_id in curr_info.agents:
+             self.evaluation_buffer[agent_id].last_brain_info = curr_info
-         for agent_id in next_info_student.agents:
-             stored_info_student = self.evaluation_buffer[agent_id].last_brain_info
-             if stored_info_student is None:
+         for agent_id in next_info.agents:
+             stored_next_info = self.evaluation_buffer[agent_id].last_brain_info
+             if stored_next_info is None:
-             next_idx = next_info_student.agents.index(agent_id)
+             next_idx = next_info.agents.index(agent_id)
-             self.cumulative_rewards[agent_id] += next_info_student.rewards[next_idx]
-             if not next_info_student.local_done[next_idx]:
+             self.cumulative_rewards[agent_id] += next_info.rewards[next_idx]
+             if not next_info.local_done[next_idx]:

-         self, current_info: AllBrainInfo, next_info: AllBrainInfo
+         self, current_info: BrainInfo, next_info: BrainInfo
-         :param current_info: Current AllBrainInfo
-         :param next_info: Next AllBrainInfo
+         :param current_info: Current BrainInfo
+         :param next_info: Next BrainInfo
-         info_student = next_info[self.brain_name]
-         for l in range(len(info_student.agents)):
-             if info_student.local_done[l]:
-                 agent_id = info_student.agents[l]
+         for l in range(len(next_info.agents)):
+             if next_info.local_done[l]:
+                 agent_id = next_info.agents[l]
                  self.stats["Environment/Cumulative Reward"].append(
                      self.cumulative_rewards.get(agent_id, 0)
                  )

ml-agents/mlagents/trainers/ppo/trainer.py (27 changes)


  import numpy as np
- from mlagents.envs.brain import AllBrainInfo
+ from mlagents.envs.brain import BrainInfo
  from mlagents.trainers.ppo.policy import PPOPolicy
  from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
  from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput

          self.collected_rewards[_reward_signal] = {}

      def process_experiences(
-         self, current_info: AllBrainInfo, next_info: AllBrainInfo
+         self, current_info: BrainInfo, next_info: BrainInfo
-         :param current_info: Dictionary of all current brains and corresponding BrainInfo.
-         :param next_info: Dictionary of all next brains and corresponding BrainInfo.
+         :param current_info: current BrainInfo.
+         :param next_info: next BrainInfo.
-         info = next_info[self.brain_name]
-         self.policy.update_normalization(info.vector_observations)
-         for l in range(len(info.agents)):
-             agent_actions = self.training_buffer[info.agents[l]]["actions"]
+         self.policy.update_normalization(next_info.vector_observations)
+         for l in range(len(next_info.agents)):
+             agent_actions = self.training_buffer[next_info.agents[l]]["actions"]
-                 info.local_done[l]
+                 next_info.local_done[l]
-                 agent_id = info.agents[l]
-                 if info.max_reached[l]:
+                 agent_id = next_info.agents[l]
+                 if next_info.max_reached[l]:
-                     bootstrapping_info = info
+                     bootstrapping_info = next_info
-                     info.local_done[l] and not info.max_reached[l],
+                     next_info.local_done[l] and not next_info.max_reached[l],
                  )
                  tmp_advantages = []

                  )
                  self.training_buffer[agent_id].reset_agent()
-                 if info.local_done[l]:
+                 if next_info.local_done[l]:
                      self.stats["Environment/Episode Length"].append(
                          self.episode_steps.get(agent_id, 0)
                      )
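One piece of logic worth calling out (unchanged by this PR, only re-pointed from the indexed info to next_info): local_done combined with max_reached decides whether the value of the last state is bootstrapped. The helper below is illustrative only and not part of ML-Agents; it just shows the distinction that the expression next_info.local_done[l] and not next_info.max_reached[l] encodes.

# Illustrative only (not ML-Agents code): a true terminal state contributes no
# future value, while an episode cut off by the step limit is bootstrapped from
# the value estimate of its last observation.
def bootstrap_value(local_done: bool, max_reached: bool, last_value_estimate: float) -> float:
    if local_done and not max_reached:
        return 0.0  # the episode genuinely ended
    return last_value_estimate  # interrupted (or still running): bootstrap

print(bootstrap_value(True, False, 0.7))  # 0.0 -> agent really finished
print(bootstrap_value(True, True, 0.7))   # 0.7 -> hit max steps, so bootstrap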

ml-agents/mlagents/trainers/rl_trainer.py (13 changes)


  from typing import Dict, List, Any, NamedTuple
  import numpy as np
- from mlagents.envs.brain import AllBrainInfo, BrainInfo
+ from mlagents.envs.brain import BrainInfo
  from mlagents.envs.action_info import ActionInfoOutputs
  from mlagents.trainers.buffer import Buffer
  from mlagents.trainers.trainer import Trainer, UnityTrainerException

      def add_experiences(
          self,
-         curr_all_info: AllBrainInfo,
-         next_all_info: AllBrainInfo,
+         curr_info: BrainInfo,
+         next_info: BrainInfo,
-         :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo.
-         :param next_all_info: Dictionary of all current brains and corresponding BrainInfo.
+         :param curr_info: current BrainInfo.
+         :param next_info: next BrainInfo.
          :param take_action_outputs: The outputs of the Policy's get_action method.
          """
          self.trainer_metrics.start_experience_collection_timer()

                  self.stats[signal.value_name].append(
                      np.mean(take_action_outputs["value_heads"][name])
                  )
-         curr_info = curr_all_info[self.brain_name]
-         next_info = next_all_info[self.brain_name]
          for agent_id in curr_info.agents:
              self.training_buffer[agent_id].last_brain_info = curr_info

ml-agents/mlagents/trainers/sac/trainer.py (23 changes)


  import numpy as np
- from mlagents.envs.brain import AllBrainInfo
+ from mlagents.envs.brain import BrainInfo
  from mlagents.envs.action_info import ActionInfoOutputs
  from mlagents.envs.timers import timed
  from mlagents.trainers.sac.policy import SACPolicy

          )

      def process_experiences(
-         self, current_info: AllBrainInfo, next_info: AllBrainInfo
+         self, current_info: BrainInfo, next_info: BrainInfo
-         :param current_info: Dictionary of all current brains and corresponding BrainInfo.
-         :param next_info: Dictionary of all next brains and corresponding BrainInfo.
+         :param current_info: current BrainInfo.
+         :param next_info: next BrainInfo.
-         info = next_info[self.brain_name]
-         self.policy.update_normalization(info.vector_observations)
-         for l in range(len(info.agents)):
-             agent_actions = self.training_buffer[info.agents[l]]["actions"]
+         self.policy.update_normalization(next_info.vector_observations)
+         for l in range(len(next_info.agents)):
+             agent_actions = self.training_buffer[next_info.agents[l]]["actions"]
-                 info.local_done[l]
+                 next_info.local_done[l]
-                 agent_id = info.agents[l]
+                 agent_id = next_info.agents[l]
-                 if info.max_reached[l]:
+                 if next_info.max_reached[l]:
                      bootstrapping_info = self.training_buffer[agent_id].last_brain_info
                      idx = bootstrapping_info.agents.index(agent_id)
                  for i, obs in enumerate(bootstrapping_info.visual_observations):

                  )
                  self.training_buffer[agent_id].reset_agent()
-                 if info.local_done[l]:
+                 if next_info.local_done[l]:
                      self.stats["Environment/Episode Length"].append(
                          self.episode_steps.get(agent_id, 0)
                      )

ml-agents/mlagents/trainers/tests/test_bc.py (22 changes)


      trainer, env = create_bc_trainer(dummy_config)
      # Test add_experiences
      returned_braininfo = env.step()
+     brain_name = "Ball3DBrain"
-         returned_braininfo, returned_braininfo, {}
+         returned_braininfo[brain_name], returned_braininfo[brain_name], {}
-     for agent_id in returned_braininfo["Ball3DBrain"].agents:
+     for agent_id in returned_braininfo[brain_name].agents:
-     returned_braininfo["Ball3DBrain"].local_done = 12 * [True]
-     trainer.process_experiences(returned_braininfo, returned_braininfo)
-     for agent_id in returned_braininfo["Ball3DBrain"].agents:
+     returned_braininfo[brain_name].local_done = 12 * [True]
+     trainer.process_experiences(
+         returned_braininfo[brain_name], returned_braininfo[brain_name]
+     )
+     for agent_id in returned_braininfo[brain_name].agents:
          assert trainer.episode_steps[agent_id] == 0
          assert trainer.cumulative_rewards[agent_id] == 0

      returned_braininfo = env.step()
+     brain_name = "Ball3DBrain"
-         returned_braininfo, returned_braininfo, {}
+         returned_braininfo[brain_name], returned_braininfo[brain_name], {}
-     trainer.process_experiences(returned_braininfo, returned_braininfo)
+     trainer.process_experiences(
+         returned_braininfo[brain_name], returned_braininfo[brain_name]
+     )
-     for agent_id in returned_braininfo["Ball3DBrain"].agents:
+     for agent_id in returned_braininfo[brain_name].agents:
          assert trainer.episode_steps[agent_id] == 0
          assert trainer.cumulative_rewards[agent_id] == 0

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (6 changes)


          num_vector_acts=2,
          num_vis_observations=num_vis_obs,
      )
-     trainer.add_experiences(
-         create_mock_all_brain_info(mock_braininfo),
-         create_mock_all_brain_info(mock_braininfo),
-         fake_action_outputs,
-     )
+     trainer.add_experiences(mock_braininfo, mock_braininfo, fake_action_outputs)
      # Remove one of the agents
      next_mock_braininfo = mb.create_mock_braininfo(

ml-agents/mlagents/trainers/tests/test_trainer_controller.py (35 changes)


  def test_take_step_adds_experiences_to_trainer_and_trains():
      tc, trainer_mock = trainer_controller_with_take_step_mocks()
-     action_info_dict = {"testbrain": MagicMock()}
+     brain_name = "testbrain"
+     action_info_dict = {brain_name: MagicMock()}
-     old_step_info = EnvironmentStep(Mock(), Mock(), action_info_dict)
-     new_step_info = EnvironmentStep(Mock(), Mock(), action_info_dict)
+     brain_info_dict = {brain_name: Mock()}
+     old_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
+     new_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
      trainer_mock.is_ready_update = MagicMock(return_value=True)
      env_mock = MagicMock()

      env_mock.reset.assert_not_called()
      env_mock.step.assert_called_once()
      trainer_mock.add_experiences.assert_called_once_with(
-         new_step_info.previous_all_brain_info,
-         new_step_info.current_all_brain_info,
-         new_step_info.brain_name_to_action_info["testbrain"].outputs,
+         new_step_info.previous_all_brain_info[brain_name],
+         new_step_info.current_all_brain_info[brain_name],
+         new_step_info.brain_name_to_action_info[brain_name].outputs,
-         new_step_info.previous_all_brain_info, new_step_info.current_all_brain_info
+         new_step_info.previous_all_brain_info[brain_name],
+         new_step_info.current_all_brain_info[brain_name],
      )
      trainer_mock.update_policy.assert_called_once()
      trainer_mock.increment_step.assert_called_once()

      tc, trainer_mock = trainer_controller_with_take_step_mocks()
      tc.train_model = False
-     action_info_dict = {"testbrain": MagicMock()}
+     brain_name = "testbrain"
+     action_info_dict = {brain_name: MagicMock()}
-     old_step_info = EnvironmentStep(Mock(), Mock(), action_info_dict)
-     new_step_info = EnvironmentStep(Mock(), Mock(), action_info_dict)
+     brain_info_dict = {brain_name: Mock()}
+     old_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
+     new_step_info = EnvironmentStep(brain_info_dict, brain_info_dict, action_info_dict)
      trainer_mock.is_ready_update = MagicMock(return_value=False)
      env_mock = MagicMock()

      env_mock.reset.assert_not_called()
      env_mock.step.assert_called_once()
      trainer_mock.add_experiences.assert_called_once_with(
-         new_step_info.previous_all_brain_info,
-         new_step_info.current_all_brain_info,
-         new_step_info.brain_name_to_action_info["testbrain"].outputs,
+         new_step_info.previous_all_brain_info[brain_name],
+         new_step_info.current_all_brain_info[brain_name],
+         new_step_info.brain_name_to_action_info[brain_name].outputs,
-         new_step_info.previous_all_brain_info, new_step_info.current_all_brain_info
+         new_step_info.previous_all_brain_info[brain_name],
+         new_step_info.current_all_brain_info[brain_name],
      )
      trainer_mock.clear_update_buffer.assert_called_once()
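A side note on why the fixtures switch from bare Mock() step infos to a real {brain_name: Mock()} dict: the controller now subscripts previous_all_brain_info and current_all_brain_info with the brain name, and a plain Mock does not implement item access, so the dict is presumably needed both for the lookup to succeed and for the assertion to compare the exact object that was passed. A small self-contained demonstration:

# Demonstrates why a plain Mock cannot stand in for AllBrainInfo once the
# controller indexes it by brain name.
from unittest.mock import Mock

brain_name = "testbrain"

try:
    Mock()[brain_name]
except TypeError as err:
    print("plain Mock is not subscriptable:", err)

brain_info_dict = {brain_name: Mock()}
print(brain_info_dict[brain_name] is brain_info_dict[brain_name])  # True: a stable object to assert against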

ml-agents/mlagents/trainers/trainer.py (16 changes)


  from mlagents.envs.timers import set_gauge
  from mlagents.trainers.trainer_metrics import TrainerMetrics
  from mlagents.trainers.tf_policy import TFPolicy
- from mlagents.envs.brain import BrainParameters, AllBrainInfo
+ from mlagents.envs.brain import BrainParameters, BrainInfo
  LOGGER = logging.getLogger("mlagents.trainers")

      def add_experiences(
          self,
-         curr_all_info: AllBrainInfo,
-         next_all_info: AllBrainInfo,
+         curr_info: BrainInfo,
+         next_info: BrainInfo,
-         :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo.
-         :param next_all_info: Dictionary of all current brains and corresponding BrainInfo.
+         :param curr_info: current BrainInfo.
+         :param next_info: next BrainInfo.

-         self, current_info: AllBrainInfo, next_info: AllBrainInfo
+         self, current_info: BrainInfo, next_info: BrainInfo
-         :param current_info: Dictionary of all current-step brains and corresponding BrainInfo.
-         :param next_info: Dictionary of all next-step brains and corresponding BrainInfo.
+         :param current_info: current BrainInfo.
+         :param next_info: next BrainInfo.
          """
          raise UnityTrainerException(
              "The process_experiences method was not implemented."

ml-agents/mlagents/trainers/trainer_controller.py (8 changes)


              self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
          if brain_name in step_info.brain_name_to_action_info:
              trainer.add_experiences(
-                 step_info.previous_all_brain_info,
-                 step_info.current_all_brain_info,
+                 step_info.previous_all_brain_info[brain_name],
+                 step_info.current_all_brain_info[brain_name],

-                 step_info.previous_all_brain_info,
-                 step_info.current_all_brain_info,
+                 step_info.previous_all_brain_info[brain_name],
+                 step_info.current_all_brain_info[brain_name],
              )
      for brain_name, trainer in self.trainers.items():
          if brain_name in self.trainer_metrics:
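The controller-side fragments above are the other half of the refactor: the [brain_name] lookup now happens once, at the call site. The sketch below shows how the dispatch reads post-merge; the loop structure and the trainer.process_experiences wrapper line are not visible in this diff and are assumptions, while the indexed arguments are exactly the ones shown above.

# Sketch of the post-merge dispatch in trainer_controller.py (loop structure and
# the process_experiences wrapper are assumed; the [brain_name] indexing and the
# .outputs argument are taken from the diffs above).
for brain_name, trainer in self.trainers.items():
    if brain_name in step_info.brain_name_to_action_info:
        trainer.add_experiences(
            step_info.previous_all_brain_info[brain_name],
            step_info.current_all_brain_info[brain_name],
            step_info.brain_name_to_action_info[brain_name].outputs,
        )
        trainer.process_experiences(
            step_info.previous_all_brain_info[brain_name],
            step_info.current_all_brain_info[brain_name],
        )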
