
Move 'take_action' into Policy class (#1669)

* Move 'take_action' into Policy class

This refactor is part of Actor-Trainer separation. Since policies
will be distributed across actors in separate processes which share
a single trainer, taking an action should be the responsibility of
the policy.

This refactor also includes a few smaller changes:
* Combines `take_action` logic between trainers, making it more
  generic
* Adds an `ActionInfo` data class to be more explicit about the
  data returned by the policy; for now it is used only by the
  TrainerController and the policy (see the sketch after this list)
* Moves trainer stats logic out of `take_action` and into
  `add_experiences`
* Renames 'take_action' to 'get_action'
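
The resulting call flow can be sketched in a few self-contained lines. Here `run_out` is a hypothetical stand-in for what `policy.evaluate()` returns, and `ActionInfo` is reproduced from the new action_info.py shown below:

from typing import NamedTuple, Any, Dict, Optional

class ActionInfo(NamedTuple):
    action: Any
    memory: Any
    text: Any
    value: Any
    outputs: Optional[Dict[str, Any]]

# Hypothetical evaluate() output for a single agent.
run_out = {"action": [1.0], "memory_out": [2.5], "value": [1.1]}

# Policy.get_action wraps the evaluate() output into an ActionInfo;
# TrainerController then reads named fields instead of a positional tuple.
info = ActionInfo(
    action=run_out.get("action"),
    memory=run_out.get("memory_out"),
    text=None,
    value=run_out.get("value"),
    outputs=run_out,
)
assert info.action == [1.0] and info.text is None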
Branch: develop-generalizationTraining-TrainerController
GitHub · 6 years ago
Commit c258b1c3
9 files changed, with 146 insertions and 87 deletions
  1. ml-agents/mlagents/trainers/__init__.py (6 changes)
  2. ml-agents/mlagents/trainers/bc/trainer.py (17 changes)
  3. ml-agents/mlagents/trainers/policy.py (24 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (29 changes)
  5. ml-agents/mlagents/trainers/trainer.py (14 changes)
  6. ml-agents/mlagents/trainers/trainer_controller.py (47 changes)
  7. ml-agents/tests/trainers/test_trainer_controller.py (36 changes)
  8. ml-agents/mlagents/trainers/action_info.py (9 changes)
  9. ml-agents/tests/trainers/test_policy.py (51 changes)

ml-agents/mlagents/trainers/__init__.py (6 changes)


from .action_info import *
from .trainer import *
from .policy import *
from .trainer_controller import *
from .bc.models import *
from .bc.offline_trainer import *

from .ppo.trainer import *
from .ppo.policy import *
from .exception import *
from .policy import *
from .demo_loader import *

ml-agents/mlagents/trainers/bc/trainer.py (17 changes)


        self.policy.increment_step()
        return

    def take_action(self, all_brain_info: AllBrainInfo):
        """
        Decides actions using policy given current brain info.
        :param all_brain_info: AllBrainInfo from environment.
        :return: a tuple containing action, memories, values and an object
            to be passed to add experiences
        """
        if len(all_brain_info[self.brain_name].agents) == 0:
            return [], [], [], None, None
        agent_brain = all_brain_info[self.brain_name]
        run_out = self.policy.evaluate(agent_brain)
        if self.policy.use_recurrent:
            return run_out['action'], run_out['memory_out'], None, None, None
        else:
            return run_out['action'], None, None, None, None

    def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo,
                        take_action_outputs):
        """

ml-agents/mlagents/trainers/policy.py (24 changes)


import numpy as np
import tensorflow as tf
from mlagents.trainers import UnityException
from mlagents.trainers import ActionInfo, UnityException
from mlagents.envs import BrainInfo
logger = logging.getLogger("mlagents.trainers")

.format(self.model_path))
self.saver.restore(self.sess, ckpt.model_checkpoint_path)
    def evaluate(self, brain_info):
    def evaluate(self, brain_info: BrainInfo):
        """
        Evaluates policy for the agent experiences provided.
        :param brain_info: BrainInfo input to network.

    def get_action(self, brain_info: BrainInfo) -> ActionInfo:
        """
        Decides actions given observations information, and takes them in environment.
        :param brain_info: A dictionary of brain names and BrainInfo from environment.
        :return: an ActionInfo containing action, memories, values and an object
            to be passed to add experiences
        """
        if len(brain_info.agents) == 0:
            return ActionInfo([], [], [], None, None)
        run_out = self.evaluate(brain_info)
        return ActionInfo(
            action=run_out.get('action'),
            memory=run_out.get('memory_out'),
            text=None,
            value=run_out.get('value'),
            outputs=run_out
        )

    def update(self, mini_batch, num_sequences):
        """

ml-agents/mlagents/trainers/ppo/trainer.py (29 changes)


        self.policy.increment_step()
        self.step = self.policy.get_current_step()

    def take_action(self, all_brain_info: AllBrainInfo):
        """
        Decides actions given observations information, and takes them in environment.
        :param all_brain_info: A dictionary of brain names and BrainInfo from environment.
        :return: a tuple containing action, memories, values and an object
            to be passed to add experiences
        """
        curr_brain_info = all_brain_info[self.brain_name]
        if len(curr_brain_info.agents) == 0:
            return [], [], [], None, None
        run_out = self.policy.evaluate(curr_brain_info)
        self.stats['Policy/Value Estimate'].append(run_out['value'].mean())
        self.stats['Policy/Entropy'].append(run_out['entropy'].mean())
        self.stats['Policy/Learning Rate'].append(run_out['learning_rate'])
        if self.policy.use_recurrent:
            return run_out['action'], run_out['memory_out'], None, \
                run_out['value'], run_out
        else:
            return run_out['action'], None, None, run_out['value'], run_out

    def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
        """
        Constructs a BrainInfo which contains the most recent previous experiences for all agents info

        Adds experiences to each agent's experience history.
        :param curr_all_info: Dictionary of all current brains and corresponding BrainInfo.
        :param next_all_info: Dictionary of all current brains and corresponding BrainInfo.
        :param take_action_outputs: The outputs of the take action method.
        :param take_action_outputs: The outputs of the Policy's get_action method.
        if take_action_outputs:
            self.stats['Policy/Value Estimate'].append(take_action_outputs['value'].mean())
            self.stats['Policy/Entropy'].append(take_action_outputs['entropy'].mean())
            self.stats['Policy/Learning Rate'].append(take_action_outputs['learning_rate'])
        curr_info = curr_all_info[self.brain_name]
        next_info = next_all_info[self.brain_name]
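
For PPO, the value, entropy, and learning-rate statistics that take_action used to record are now recorded in add_experiences, read from the outputs dict that get_action carries along. A standalone sketch of that bookkeeping, with made-up values:

import numpy as np

# take_action_outputs is ActionInfo.outputs, i.e. the raw policy.evaluate() dict;
# the numbers here are hypothetical.
take_action_outputs = {
    "value": np.array([0.9, 1.1]),
    "entropy": np.array([1.2, 1.3]),
    "learning_rate": 3.0e-4,
}

stats = {"Policy/Value Estimate": [], "Policy/Entropy": [], "Policy/Learning Rate": []}
if take_action_outputs:
    stats["Policy/Value Estimate"].append(take_action_outputs["value"].mean())
    stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean())
    stats["Policy/Learning Rate"].append(take_action_outputs["learning_rate"])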

ml-agents/mlagents/trainers/trainer.py (14 changes)


import tensorflow as tf
import numpy as np
from mlagents.envs import UnityException, AllBrainInfo
from mlagents.envs import UnityException, AllBrainInfo, BrainInfo
from mlagents.trainers import ActionInfo
logger = logging.getLogger("mlagents.trainers")

        raise UnityTrainerException(
            "The increment_step_and_update_last_reward method was not implemented.")

    def take_action(self, all_brain_info: AllBrainInfo):
    def get_action(self, curr_info: BrainInfo) -> ActionInfo:
        Decides actions given state/observation information, and takes them in environment.
        :param all_brain_info: A dictionary of brain names and BrainInfo from environment.
        :return: a tuple containing action, memories, values and an object
            to be passed to add experiences
        Get an action using this trainer's current policy.
        :param curr_info: Current BrainInfo.
        :return: The ActionInfo given by the policy given the BrainInfo.
        raise UnityTrainerException("The take_action method was not implemented.")
        return self.policy.get_action(curr_info)

    def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo,
                        take_action_outputs):
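
Since the base Trainer.get_action now simply forwards to the trainer's policy, subclasses such as PPOTrainer and the BC trainer no longer override it. A small sketch of that delegation; Trainer.__new__ is used here only to skip the real constructor for illustration, and the mocks stand in for a policy and a BrainInfo:

from unittest.mock import MagicMock
from mlagents.trainers import ActionInfo
from mlagents.trainers.trainer import Trainer

trainer = Trainer.__new__(Trainer)          # bypass the constructor for this sketch
trainer.policy = MagicMock()
trainer.policy.get_action.return_value = ActionInfo([0.1], None, None, [0.9], {})

brain_info = MagicMock()                    # stands in for a BrainInfo
assert trainer.get_action(brain_info) == ActionInfo([0.1], None, None, [0.9], {})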

ml-agents/mlagents/trainers/trainer_controller.py (47 changes)


import numpy as np
import tensorflow as tf
from mlagents.envs import BrainInfo
from mlagents.envs import AllBrainInfo, BrainInfo
from mlagents.trainers import Trainer, Policy
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer
from mlagents.trainers.bc.online_trainer import OnlineBCTrainer

self.load_model = load
self.train_model = train
self.keep_checkpoints = keep_checkpoints
self.trainers = {}
self.trainers: Dict[str, Trainer] = {}
self.global_step = 0
self.meta_curriculum = meta_curriculum
self.seed = training_seed

for _, t in self.trainers.items():
self.logger.info(t)
curr_info = self._reset_env(env)
if self.train_model:
for brain_name, trainer in self.trainers.items():
trainer.write_tensorboard_text('Hyperparameters',

win32api.SetConsoleCtrlHandler(self._win_handler, True)
try:
curr_info = self._reset_env(env)
while any([t.get_step <= t.get_max_steps \
for k, t in self.trainers.items()]) \
or not self.train_model:

if self.train_model:
self._export_graph()
def take_step(self, env, curr_info):
def take_step(self, env, curr_info: AllBrainInfo):
reward_buff_sizes = {k: len(t.reward_buffer) \
reward_buff_sizes = {k: len(t.reward_buffer)
for (k, t) in self.trainers.items()}
# Attempt to increment the lessons of the brains who
# were ready.

reward_buff_sizes=reward_buff_sizes)
else:
lessons_incremented = {}
# If any lessons were incremented or the environment is
# ready to be reset

trainer.end_episode()
# Decide and take an action
take_action_vector, \
take_action_memories, \
take_action_text, \
take_action_value, \
take_action_outputs \
= {}, {}, {}, {}, {}
take_action_vector = {}
take_action_memories = {}
take_action_text = {}
take_action_value = {}
take_action_outputs = {}
(take_action_vector[brain_name],
take_action_memories[brain_name],
take_action_text[brain_name],
take_action_value[brain_name],
take_action_outputs[brain_name]) = \
trainer.take_action(curr_info)
new_info = env.step(vector_action=take_action_vector,
memory=take_action_memories,
text_action=take_action_text,
value=take_action_value)
action_info = trainer.get_action(curr_info[brain_name])
take_action_vector[brain_name] = action_info.action
take_action_memories[brain_name] = action_info.memory
take_action_text[brain_name] = action_info.text
take_action_value[brain_name] = action_info.value
take_action_outputs[brain_name] = action_info.outputs
new_info = env.step(
vector_action=take_action_vector,
memory=take_action_memories,
text_action=take_action_text,
value=take_action_value
)
for brain_name, trainer in self.trainers.items():
trainer.add_experiences(curr_info, new_info,
take_action_outputs[brain_name])
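
The controller-side bookkeeping can be seen in isolation below; the ActionInfo instance is a hypothetical stand-in for what trainer.get_action(curr_info[brain_name]) returns for each brain in the real take_step loop.

from mlagents.trainers import ActionInfo

# Hypothetical per-brain result; in take_step each one comes from
# trainer.get_action(curr_info[brain_name]).
action_infos = {"testbrain": ActionInfo("action", "memory", None, "value", {"raw": "evaluate output"})}

take_action_vector, take_action_memories, take_action_text = {}, {}, {}
take_action_value, take_action_outputs = {}, {}
for brain_name, action_info in action_infos.items():
    take_action_vector[brain_name] = action_info.action
    take_action_memories[brain_name] = action_info.memory
    take_action_text[brain_name] = action_info.text
    take_action_value[brain_name] = action_info.value
    take_action_outputs[brain_name] = action_info.outputs

# These dicts feed env.step(...); each brain's outputs entry is later handed
# to that trainer's add_experiences.
assert take_action_vector == {"testbrain": "action"}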

ml-agents/tests/trainers/test_trainer_controller.py (36 changes)


import yaml
import pytest
from mlagents.trainers import ActionInfo
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.bc.offline_trainer import OfflineBCTrainer

tc, trainer_mock = trainer_controller_with_take_step_mocks()
brain_info_mock = MagicMock()
action_data_mock_out = [None, None, None, None, None]
trainer_mock.take_action = MagicMock(return_value=action_data_mock_out)
trainer_mock.add_experiences = MagicMock()
trainer_mock.process_experiences = MagicMock()
trainer_mock.update_policy = MagicMock()

env_mock.reset = MagicMock(return_value=brain_info_mock)
env_mock.global_done = True
trainer_mock.get_action = MagicMock(return_value = ActionInfo(None, None, None, None, None))
tc.take_step(env_mock, brain_info_mock)
env_mock.reset.assert_called_once()

curr_info_mock = MagicMock()
trainer_action_output_mock = [
'action',
'memory',
'actiontext',
'value',
'output',
]
trainer_mock.take_action = MagicMock(return_value=trainer_action_output_mock)
brain_info_mock = MagicMock()
curr_info_mock.__getitem__ = MagicMock(return_value=brain_info_mock)
trainer_mock.is_ready_update = MagicMock(return_value=True)
env_mock = MagicMock()

env_mock.reset = MagicMock(return_value=curr_info_mock)
env_mock.global_done = False
action_output_mock = ActionInfo(
'action',
'memory',
'actiontext',
'value',
{'some': 'output'}
)
trainer_mock.get_action = MagicMock(return_value=action_output_mock)
trainer_mock.take_action.assert_called_once_with(curr_info_mock)
trainer_mock.get_action.assert_called_once_with(brain_info_mock)
vector_action={'testbrain': trainer_action_output_mock[0]},
memory={'testbrain': trainer_action_output_mock[1]},
text_action={'testbrain': trainer_action_output_mock[2]},
value={'testbrain': trainer_action_output_mock[3]}
vector_action={'testbrain': action_output_mock.action},
memory={'testbrain': action_output_mock.memory},
text_action={'testbrain': action_output_mock.text},
value={'testbrain': action_output_mock.value}
curr_info_mock, env_step_output_mock, trainer_action_output_mock[4]
curr_info_mock, env_step_output_mock, action_output_mock.outputs
)
trainer_mock.process_experiences.assert_called_once_with(curr_info_mock, env_step_output_mock)
trainer_mock.update_policy.assert_called_once()

ml-agents/mlagents/trainers/action_info.py (9 changes)


from typing import NamedTuple, Any, Dict, Optional


class ActionInfo(NamedTuple):
    action: Any
    memory: Any
    text: Any
    value: Any
    outputs: Optional[Dict[str, Any]]
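
Usage is plain NamedTuple behaviour: fields are read by name, and equality still works like a tuple, which is what the updated tests rely on. The values below are hypothetical:

from mlagents.trainers.action_info import ActionInfo

info = ActionInfo(action=[0.2], memory=None, text=None, value=[1.0], outputs={"action": [0.2]})
assert info.action == [0.2]
assert info == ([0.2], None, None, [1.0], {"action": [0.2]})   # compares like a tuple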

ml-agents/tests/trainers/test_policy.py (51 changes)


from mlagents.trainers.policy import *
from unittest.mock import MagicMock


def basic_mock_brain():
    mock_brain = MagicMock()
    mock_brain.vector_action_space_type = "continuous"
    return mock_brain


def basic_params():
    return {
        "use_recurrent": False,
        "model_path": "my/path"
    }


def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = Policy(test_seed, basic_mock_brain(), basic_params())
    no_agent_brain_info = BrainInfo([], [], [], agents=[])
    result = policy.get_action(no_agent_brain_info)
    assert(result == ActionInfo([], [], [], None, None))


def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = Policy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    brain_info_with_agents = BrainInfo([], [], [], agents=['an-agent-id'])
    result = policy.get_action(brain_info_with_agents)
    assert(result == ActionInfo(None, None, None, None, {}))


def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = Policy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        'action': np.array([1.0]),
        'memory_out': np.array([2.5]),
        'value': np.array([1.1])
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    brain_info_with_agents = BrainInfo([], [], [], agents=['an-agent-id'])
    result = policy.get_action(brain_info_with_agents)
    expected = ActionInfo(
        policy_eval_out['action'],
        policy_eval_out['memory_out'],
        None,
        policy_eval_out['value'],
        policy_eval_out
    )
    assert (result == expected)