
Support tf and pytorch alongside one another

/develop/add-fire
Arthur Juliani, 5 years ago
Commit ca887743
10 changed files with 82 additions and 32 deletions
  1. ml-agents/mlagents/trainers/optimizer/optimizer.py (7 changes)
  2. ml-agents/mlagents/trainers/policy/policy.py (9 changes)
  3. ml-agents/mlagents/trainers/policy/tf_policy.py (9 changes)
  4. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (2 changes)
  5. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (10 changes)
  6. ml-agents/mlagents/trainers/ppo/trainer.py (49 changes)
  7. ml-agents/mlagents/trainers/tests/test_ppo.py (4 changes)
  8. ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changes)
  9. ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changes)
  10. ml-agents/mlagents/trainers/trainer/trainer.py (14 changes)

ml-agents/mlagents/trainers/optimizer/optimizer.py (7 changes)


import abc
class Optimizer(abc.ABC):
class Optimizer(object):
@abc.abstractmethod
def __init__(self):
self.reward_signals = {}
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""
Update the Policy based on the batch that was passed in.
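Read together, this hunk leaves the base Optimizer as a plain class that both backends subclass, holding only the shared reward_signals dictionary and the update() contract. A minimal standalone sketch of that shape, not the exact file contents: the NotImplementedError body is an assumption (the diff cuts off inside the docstring) and the AgentBuffer annotation is dropped to keep the sketch self-contained.

    from typing import Dict

    class Optimizer:
        """Framework-agnostic base shared by the TF and Torch PPO optimizers."""

        def __init__(self):
            # Concrete subclasses register their reward signal providers here.
            self.reward_signals: Dict[str, object] = {}

        def update(self, batch, num_sequences: int) -> Dict[str, float]:
            """Update the Policy based on the batch that was passed in."""
            raise NotImplementedError  # assumed body; subclasses implement the real update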

ml-agents/mlagents/trainers/policy/policy.py (9 changes)


self.memory_dict: Dict[str, np.ndarray] = {}
self.normalize = trainer_params["normalize"]
self.use_recurrent = trainer_params["use_recurrent"]
self.model_path = trainer_params["model_path"]
if self.use_recurrent:
self.m_size = trainer_params["memory_size"]

raise NotImplementedError
@abstractmethod
def update_normalization(self, vector_obs: np.ndarray) -> None:
pass
@abstractmethod
def export_model(self, step=0):
pass

@abstractmethod
def increment_step(self, n_steps):
pass
@abstractmethod
def get_current_step(self):
pass
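These hunks build up an abstract Policy base class: shared bookkeeping moves into its constructor and the backend-specific operations become abstract methods. A hedged reconstruction follows; the abstract methods are taken from the diff, the (brain, seed, trainer_params) argument order is inferred from the TFPolicy super() call further down, and the m_size default of 0 is an assumption.

    from abc import ABC, abstractmethod
    from typing import Any, Dict
    import numpy as np

    class Policy(ABC):
        def __init__(self, brain: Any, seed: int, trainer_params: Dict[str, Any]):
            self.memory_dict: Dict[str, np.ndarray] = {}
            self.normalize = trainer_params["normalize"]
            self.use_recurrent = trainer_params["use_recurrent"]
            self.model_path = trainer_params["model_path"]
            # Only recurrent policies carry a memory size; the 0 default is assumed.
            self.m_size = trainer_params["memory_size"] if self.use_recurrent else 0

        @abstractmethod
        def update_normalization(self, vector_obs: np.ndarray) -> None:
            pass

        @abstractmethod
        def export_model(self, step=0):
            pass

        @abstractmethod
        def increment_step(self, n_steps):
            pass

        @abstractmethod
        def get_current_step(self):
            pass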

ml-agents/mlagents/trainers/policy/tf_policy.py (9 changes)


:param brain: The corresponding Brain for this policy.
:param trainer_parameters: The trainer parameters.
"""
super(TFPolicy, self).__init__(brain, seed)
super(TFPolicy, self).__init__(
brain=brain, seed=seed, trainer_params=trainer_parameters
)
self._version_number_ = 2
self.m_size = 0

self.vec_obs_size = brain.vector_observation_space_size
self.vis_obs_size = brain.number_visual_observations
self.model_path = trainer_parameters["model_path"]
self.initialize_path = trainer_parameters.get("init_path", None)
self.keep_checkpoints = trainer_parameters.get("keep_checkpoints", 5)
self.graph = tf.Graph()

Builds the tensorflow graph needed for this policy.
"""
pass
def load_model(self, step=0):
reset_steps = not self.load
self._load_graph(self.model_path, reset_global_steps=reset_steps)
def _initialize_graph(self):
with self.graph.as_default():

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (2 changes)


from mlagents.trainers.buffer import AgentBuffer
class PPOOptimizer(TFOptimizer):
class TFPPOOptimizer(TFOptimizer):
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (10 changes)


from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
class PPOOptimizer(TorchOptimizer):
class TorchPPOOptimizer(TorchOptimizer):
def __init__(self, policy: TorchPolicy, trainer_params: Dict[str, Any]):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.

"""
# Create the graph here to give more granular control of the TF graph to the Optimizer.
super(PPOOptimizer, self).__init__(policy, trainer_params)
super(TorchPPOOptimizer, self).__init__(policy, trainer_params)
params = list(self.policy.actor_critic.parameters())
self.optimizer = torch.optim.Adam(

vec_obs = np.array(batch["vector_obs"])
vec_obs = [torch.Tensor(vec_obs)]
act_masks = torch.Tensor(np.array(batch["action_mask"]))
actions = [torch.Tensor(np.array(batch["actions"]))]
if self.policy.use_continuous_act:
actions = [torch.Tensor(np.array(batch["actions"]))]
else:
actions = list(torch.Tensor(np.array(batch["actions"])).permute(1, 0))
memories = [
torch.Tensor(np.array(batch["memory"][i]))
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
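The tensor-conversion hunk above reads an AgentBuffer batch into torch tensors, branching on continuous versus discrete actions and slicing memories per sequence. Below is a self-contained sketch of that conversion using a plain dictionary of dummy numpy arrays in place of the real AgentBuffer; the keys mirror the diff, while the shapes, the flags, and the final print are illustrative only.

    import numpy as np
    import torch

    # Dummy batch standing in for an AgentBuffer; key names mirror the hunk.
    batch = {
        "vector_obs": np.random.rand(6, 8).astype(np.float32),  # 6 steps, 8 obs dims
        "actions": np.random.rand(6, 2).astype(np.float32),     # 6 steps, 2 action dims
        "memory": np.random.rand(6, 16).astype(np.float32),     # 6 steps, 16 memory dims
    }
    use_continuous_act = True   # stands in for self.policy.use_continuous_act
    sequence_length = 3         # stands in for self.policy.sequence_length

    vec_obs = [torch.Tensor(np.array(batch["vector_obs"]))]
    if use_continuous_act:
        # Continuous control: keep the whole action tensor together.
        actions = [torch.Tensor(np.array(batch["actions"]))]
    else:
        # Discrete control: split into per-branch tensors by transposing to (branch, step).
        actions = list(torch.Tensor(np.array(batch["actions"])).permute(1, 0))
    # One memory tensor per sequence start, stepping by sequence_length.
    memories = [
        torch.Tensor(np.array(batch["memory"][i]))
        for i in range(0, len(batch["memory"]), sequence_length)
    ]
    print(vec_obs[0].shape, len(actions), len(memories))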

ml-agents/mlagents/trainers/ppo/trainer.py (49 changes)


from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.ppo.optimizer_torch import PPOOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
logger = get_logger(__name__)

self._check_param_keys()
self.load = load
self.seed = seed
self.policy: TorchPolicy = None # type: ignore
self.framework = "torch"
self.policy: Policy = None # type: ignore
def _check_param_keys(self):
super()._check_param_keys()

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> Policy:
if self.framework == "torch":
return self.create_torch_policy(parsed_behavior_id, brain_parameters)
else:
return self.create_tf_policy(parsed_behavior_id, brain_parameters)
def create_tf_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> NNPolicy:
"""
Creates a PPO policy to trainers list of policies.
:param parsed_behavior_id:
:param brain_parameters: specifications for policy construction
:return policy
"""
policy = NNPolicy(
self.seed,
brain_parameters,
self.trainer_parameters,
self.is_training,
self.load,
condition_sigma_on_obs=False, # Faster training for PPO
)
return policy
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TorchPolicy:
"""
Creates a PPO policy to trainers list of policies.

self.load,
condition_sigma_on_obs=False, # Faster training for PPO
)
self, parsed_behavior_id: BehaviorIdentifiers, policy: TorchPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

if not isinstance(policy, Policy):
raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
self.policy = policy
self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
if self.framework == "torch":
self.optimizer = TorchPPOOptimizer( # type: ignore
self.policy, self.trainer_parameters # type: ignore
) # type: ignore
else:
self.optimizer = TFPPOOptimizer( # type: ignore
self.policy, self.trainer_parameters # type: ignore
) # type: ignore
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly

def get_policy(self, name_behavior_id: str) -> TorchPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy
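The trainer changes boil down to a framework switch: the trainer now stores a generic Policy, hard-codes self.framework = "torch" for the moment, and picks the matching policy and optimizer classes at runtime. A simplified, runnable sketch of that dispatch; the *Stub classes only stand in for the real NNPolicy/TorchPolicy and optimizer implementations.

    from typing import Any, Dict

    # Stand-ins for the real classes; the names merely mirror the diff.
    class TorchPolicyStub: ...
    class NNPolicyStub: ...
    class TorchPPOOptimizerStub:
        def __init__(self, policy, params): self.reward_signals = {}
    class TFPPOOptimizerStub:
        def __init__(self, policy, params): self.reward_signals = {}

    class PPOTrainerSketch:
        """Minimal sketch of the framework dispatch added to PPOTrainer."""

        def __init__(self, framework: str, trainer_parameters: Dict[str, Any]):
            self.framework = framework            # hard-coded to "torch" in the diff
            self.trainer_parameters = trainer_parameters
            self.policy = None
            self.optimizer = None

        def create_policy(self, parsed_behavior_id, brain_parameters):
            # Route policy construction by framework.
            if self.framework == "torch":
                return TorchPolicyStub()
            return NNPolicyStub()

        def add_policy(self, parsed_behavior_id, policy):
            self.policy = policy
            # Pick the optimizer that matches the chosen backend.
            if self.framework == "torch":
                self.optimizer = TorchPPOOptimizerStub(self.policy, self.trainer_parameters)
            else:
                self.optimizer = TFPPOOptimizerStub(self.policy, self.trainer_parameters)

    trainer = PPOTrainerSketch("torch", {})
    trainer.add_policy(None, trainer.create_policy(None, None))
    print(type(trainer.optimizer).__name__)  # TorchPPOOptimizerStub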

ml-agents/mlagents/trainers/tests/test_ppo.py (4 changes)


import yaml
from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.agent_processor import AgentManagerQueue

policy = NNPolicy(
0, mock_brain, trainer_parameters, False, False, create_tf_graph=False
)
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = TFPPOOptimizer(policy, trainer_parameters)
return optimizer

ml-agents/mlagents/trainers/tests/test_reward_signals.py (4 changes)


import mlagents.trainers.tests.mock_brain as mb
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.sac.optimizer import SACOptimizer
from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
from mlagents.trainers.ppo.optimizer_tf import TFPPOOptimizer
CONTINUOUS_PATH = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
DISCRETE_PATH = os.path.dirname(os.path.abspath(__file__)) + "/testdcvis.demo"

if trainer_parameters["trainer"] == "sac":
optimizer = SACOptimizer(policy, trainer_parameters)
else:
optimizer = PPOOptimizer(policy, trainer_parameters)
optimizer = TFPPOOptimizer(policy, trainer_parameters)
return optimizer

ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changes)


import abc
import time
from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
from mlagents.trainers.optimizer.optimizer import Optimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException

for agent_id in rewards:
rewards[agent_id] = 0
def _update_end_episode_stats(
self, agent_id: str, optimizer: TorchOptimizer
) -> None:
def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
for name, rewards in self.collected_rewards.items():
if name == "environment":
self.stats_reporter.add_stat(
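The substantive change in this file is the annotation: _update_end_episode_stats is now typed against the base Optimizer rather than TorchOptimizer, so the same bookkeeping accepts either backend. A toy illustration of that duck-typed usage; the subclass stubs and the print are illustrative only.

    from typing import Dict

    class Optimizer:
        """Base type used in the annotation so RLTrainer stays backend-agnostic."""
        def __init__(self):
            self.reward_signals: Dict[str, object] = {}

    class TorchOptimizer(Optimizer):
        """Stand-in for the torch-backed optimizer."""

    class TFOptimizer(Optimizer):
        """Stand-in for the TF-backed optimizer."""

    def _update_end_episode_stats(agent_id: str, optimizer: Optimizer) -> None:
        # Only the shared interface is touched, so either subclass works.
        for name in optimizer.reward_signals:
            print(agent_id, name)

    _update_end_episode_stats("agent-0", TorchOptimizer())
    _update_end_episode_stats("agent-0", TFOptimizer())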

ml-agents/mlagents/trainers/trainer/trainer.py (14 changes)


from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.policy.policy import Policy
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers

self._stats_reporter = StatsReporter(self.summary_path)
self.is_training = training
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy_queues: List[AgentManagerQueue[TorchPolicy]] = []
self.policy_queues: List[AgentManagerQueue[Policy]] = []
self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
self.step: int = 0
self.summary_freq = self.trainer_parameters["summary_freq"]

@abc.abstractmethod
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TorchPolicy:
) -> Policy:
"""
Creates policy
"""

def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TorchPolicy
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
) -> None:
"""
Adds policy to trainer.

@abc.abstractmethod
def get_policy(self, name_behavior_id: str) -> TorchPolicy:
def get_policy(self, name_behavior_id: str) -> Policy:
"""
Gets policy from trainer.
"""

"""
pass
def publish_policy_queue(
self, policy_queue: AgentManagerQueue[TorchPolicy]
) -> None:
def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
"""
Adds a policy queue to the list of queues to publish to when this Trainer
makes a policy update
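After this file's changes, the abstract Trainer interface speaks entirely in terms of the generic Policy type, so TF and Torch trainers can satisfy one contract. A hedged sketch of the resulting abstract surface; Policy and AgentManagerQueue here are simple stand-ins for the real mlagents classes, and only the members visible in the hunk are reproduced.

    import abc
    from collections import deque
    from typing import Deque, List

    class Policy:
        """Stand-in for the framework-agnostic Policy base class."""

    class AgentManagerQueue:
        """Tiny stand-in; the real queue is generic over its payload type."""
        def __init__(self):
            self.items: List[object] = []
        def put(self, item) -> None:
            self.items.append(item)

    class TrainerSketch(abc.ABC):
        """Abstract surface of Trainer after the change: every hook trades in Policy."""

        def __init__(self, reward_buff_cap: int = 100):
            self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
            self.policy_queues: List[AgentManagerQueue] = []
            self.step: int = 0

        @abc.abstractmethod
        def create_policy(self, parsed_behavior_id, brain_parameters) -> Policy:
            ...

        @abc.abstractmethod
        def add_policy(self, parsed_behavior_id, policy: Policy) -> None:
            ...

        @abc.abstractmethod
        def get_policy(self, name_behavior_id: str) -> Policy:
            ...

        def publish_policy_queue(self, policy_queue: AgentManagerQueue) -> None:
            # Consumers of this queue now receive framework-agnostic Policy objects.
            self.policy_queues.append(policy_queue)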
