
[refactor] Make classes except Optimizer framework agnostic (#4268)

/MLA-1734-demo-provider
GitHub, 4 years ago
Current commit
beb5aca5
11 files changed, 46 insertions and 84 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (5 changed lines)
  2. ml-agents/mlagents/trainers/env_manager.py (6 changed lines)
  3. ml-agents/mlagents/trainers/ghost/trainer.py (11 changed lines)
  4. ml-agents/mlagents/trainers/optimizer/optimizer.py (3 changed lines)
  5. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2 changed lines)
  6. ml-agents/mlagents/trainers/policy/policy.py (12 changed lines)
  7. ml-agents/mlagents/trainers/policy/tf_policy.py (56 changed lines)
  8. ml-agents/mlagents/trainers/ppo/trainer.py (11 changed lines)
  9. ml-agents/mlagents/trainers/sac/trainer.py (11 changed lines)
  10. ml-agents/mlagents/trainers/trainer/rl_trainer.py (4 changed lines)
  11. ml-agents/mlagents/trainers/trainer/trainer.py (9 changed lines)
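
Before the per-file diffs, here is a minimal, hypothetical sketch of the pattern this commit moves toward: trainer- and manager-side code is annotated with an abstract Policy, while only optimizer code holds a concrete framework-specific policy type. The class and method names below (Policy, TFPolicy, EnvManagerSketch) are illustrative stand-ins, not the actual ml-agents implementations.

from abc import ABC, abstractmethod
from typing import Dict, List

import numpy as np


class Policy(ABC):
    """Framework-agnostic interface that trainers and managers depend on."""

    @abstractmethod
    def get_weights(self) -> List[np.ndarray]:
        ...

    @abstractmethod
    def load_weights(self, values: List[np.ndarray]) -> None:
        ...


class TFPolicy(Policy):
    """Stand-in for the TensorFlow-backed policy; only optimizer code should need it."""

    def __init__(self) -> None:
        self._weights = [np.zeros(4, dtype=np.float32)]

    def get_weights(self) -> List[np.ndarray]:
        return [w.copy() for w in self._weights]

    def load_weights(self, values: List[np.ndarray]) -> None:
        self._weights = [v.copy() for v in values]


class EnvManagerSketch:
    """Mirrors the EnvManager change below: the dict is typed with the base class."""

    def __init__(self) -> None:
        self.policies: Dict[str, Policy] = {}

    def set_policy(self, behavior_name: str, policy: Policy) -> None:
        self.policies[behavior_name] = policy

Because EnvManagerSketch only touches the Policy interface, a policy backed by any framework can be dropped in without changing manager or trainer code; the per-file diffs below apply exactly that substitution to the type annotations.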

ml-agents/mlagents/trainers/agent_processor.py (5 changed lines)


     EnvironmentStats,
 )
 from mlagents.trainers.trajectory import Trajectory, AgentExperience
-from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.policy import Policy
 from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs
 from mlagents.trainers.stats import StatsReporter

     def __init__(
         self,
-        policy: TFPolicy,
+        policy: Policy,
         behavior_id: str,
         stats_reporter: StatsReporter,
         max_trajectory_length: int = sys.maxsize,

     def __init__(
         self,
-        policy: TFPolicy,
+        policy: Policy,
         behavior_id: str,
         stats_reporter: StatsReporter,
         max_trajectory_length: int = sys.maxsize,

ml-agents/mlagents/trainers/env_manager.py (6 changed lines)


 )
 from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats
-from mlagents.trainers.policy.tf_policy import TFPolicy
+from mlagents.trainers.policy import Policy
 from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue
 from mlagents.trainers.action_info import ActionInfo
 from mlagents_envs.logging_util import get_logger

 class EnvManager(ABC):
     def __init__(self):
-        self.policies: Dict[BehaviorName, TFPolicy] = {}
+        self.policies: Dict[BehaviorName, Policy] = {}

-    def set_policy(self, brain_name: BehaviorName, policy: TFPolicy) -> None:
+    def set_policy(self, brain_name: BehaviorName, policy: Policy) -> None:
         self.policies[brain_name] = policy
         if brain_name in self.agent_managers:
             self.agent_managers[brain_name].policy = policy

ml-agents/mlagents/trainers/ghost/trainer.py (11 changed lines)


 # ## ML-Agent Learning (Ghost Trainer)
 from collections import defaultdict
-from typing import Deque, Dict, DefaultDict, List, cast
+from typing import Deque, Dict, DefaultDict, List
 import numpy as np

-from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.trainer import Trainer
 from mlagents.trainers.trajectory import Trajectory

         for brain_name in self._internal_policy_queues:
             internal_policy_queue = self._internal_policy_queues[brain_name]
             try:
-                policy = cast(TFPolicy, internal_policy_queue.get_nowait())
+                policy = internal_policy_queue.get_nowait()
                 self.current_policy_snapshot[brain_name] = policy.get_weights()
             except AgentManagerQueue.Empty:
                 pass

     def create_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
-    ) -> TFPolicy:
+    ) -> Policy:
         """
         Creates policy with the wrapped trainer's create_policy function
         The first policy encountered sets the wrapped

         return policy

     def add_policy(
-        self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+        self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:
         """
         Adds policy to GhostTrainer.

         self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
         self.policies[name_behavior_id] = policy

-    def get_policy(self, name_behavior_id: str) -> TFPolicy:
+    def get_policy(self, name_behavior_id: str) -> Policy:
         """
         Gets policy associated with name_behavior_id
         :param name_behavior_id: Fully qualified behavior name
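
The removed typing import and the removed cast above go together: once the queue's element type already declares get_weights(), the call site no longer needs cast(TFPolicy, ...) to satisfy the type checker. A standalone sketch of that idea, using a Protocol and illustrative names (HasWeights, drain_weight_snapshots) rather than ml-agents code:

import queue
from typing import List, Protocol

import numpy as np


class HasWeights(Protocol):
    """Structural type: anything exposing get_weights() qualifies."""

    def get_weights(self) -> List[np.ndarray]:
        ...


def drain_weight_snapshots(
    policy_queue: "queue.Queue[HasWeights]",
) -> List[List[np.ndarray]]:
    """Pull every queued policy and record its weights, with no typing.cast."""
    snapshots: List[List[np.ndarray]] = []
    while True:
        try:
            policy = policy_queue.get_nowait()  # already typed as HasWeights
        except queue.Empty:
            break
        snapshots.append(policy.get_weights())
    return snapshots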

ml-agents/mlagents/trainers/optimizer/optimizer.py (3 changed lines)


     Provides methods to update the Policy.
     """

+    def __init__(self):
+        self.reward_signals = {}
+
     @abc.abstractmethod
     def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         """

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2 changed lines)


 class TFOptimizer(Optimizer):  # pylint: disable=W0223
     def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
+        super().__init__()
         self.sess = policy.sess
         self.policy = policy
         self.update_dict: Dict[str, tf.Tensor] = {}

         Create reward signals
         :param reward_signal_configs: Reward signal config.
         """
-        self.reward_signals = {}
         # Create reward signals
         for reward_signal, settings in reward_signal_configs.items():
             # Name reward signals by string in case we have duplicates later

ml-agents/mlagents/trainers/policy/policy.py (12 changed lines)


+    @abstractmethod
+    def save(self, output_filepath: str, settings: SerializationSettings) -> None:
+        pass
+    @abstractmethod
+    def load_weights(self, values: List[np.ndarray]) -> None:
+        pass
+    @abstractmethod
+    def get_weights(self) -> List[np.ndarray]:
+        return []
+    @abstractmethod
+    def init_load_weights(self) -> None:
+        pass
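
These four additions define the weight and serialization hooks that the rest of the codebase can now call through the base class. As an illustration only, here is a toy in-memory backend that satisfies them; the class name InMemoryPolicy is hypothetical and the save() signature is simplified (the real method also receives SerializationSettings).

from typing import List

import numpy as np


class InMemoryPolicy:
    """Toy policy backend used only to illustrate the abstract hooks above."""

    def __init__(self, layer_sizes: List[int]) -> None:
        self._weights = [np.zeros(size, dtype=np.float32) for size in layer_sizes]

    def save(self, output_filepath: str) -> None:
        # Simplified: serialize all weight arrays to a single .npz file.
        np.savez(output_filepath, *self._weights)

    def init_load_weights(self) -> None:
        # Nothing to prepare for a plain in-memory store; a graph-based backend
        # would build its load/assign operations here before load_weights runs.
        pass

    def get_weights(self) -> List[np.ndarray]:
        return [w.copy() for w in self._weights]

    def load_weights(self, values: List[np.ndarray]) -> None:
        self._weights = [v.copy() for v in values]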

ml-agents/mlagents/trainers/policy/tf_policy.py (56 changed lines)


             feed_dict[self.action_masks] = mask
         return feed_dict

-    def make_empty_memory(self, num_agents):
-        """
-        Creates empty memory for use with RNNs
-        :param num_agents: Number of agents.
-        :return: Numpy array of zeros.
-        """
-        return np.zeros((num_agents, self.m_size), dtype=np.float32)
-
-    def save_memories(
-        self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
-    ) -> None:
-        if memory_matrix is None:
-            return
-        for index, agent_id in enumerate(agent_ids):
-            self.memory_dict[agent_id] = memory_matrix[index, :]
-
-    def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
-        memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
-        for index, agent_id in enumerate(agent_ids):
-            if agent_id in self.memory_dict:
-                memory_matrix[index, :] = self.memory_dict[agent_id]
-        return memory_matrix
-
-    def remove_memories(self, agent_ids):
-        for agent_id in agent_ids:
-            if agent_id in self.memory_dict:
-                self.memory_dict.pop(agent_id)
-
-    def make_empty_previous_action(self, num_agents):
-        """
-        Creates empty previous action for use with RNNs and discrete control
-        :param num_agents: Number of agents.
-        :return: Numpy array of zeros.
-        """
-        return np.zeros((num_agents, self.num_branches), dtype=np.int)
-
-    def save_previous_action(
-        self, agent_ids: List[str], action_matrix: Optional[np.ndarray]
-    ) -> None:
-        if action_matrix is None:
-            return
-        for index, agent_id in enumerate(agent_ids):
-            self.previous_action_dict[agent_id] = action_matrix[index, :]
-
-    def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
-        action_matrix = np.zeros((len(agent_ids), self.num_branches), dtype=np.int)
-        for index, agent_id in enumerate(agent_ids):
-            if agent_id in self.previous_action_dict:
-                action_matrix[index, :] = self.previous_action_dict[agent_id]
-        return action_matrix
-
-    def remove_previous_action(self, agent_ids):
-        for agent_id in agent_ids:
-            if agent_id in self.previous_action_dict:
-                self.previous_action_dict.pop(agent_id)

     def get_current_step(self):
         """
         Gets current model step.
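
The block changed above is the per-agent recurrent-memory and previous-action bookkeeping. To make its behavior concrete, here is a self-contained sketch that mirrors save_memories/retrieve_memories under an illustrative name (MemoryStore); the key point is that rows are pre-allocated with np.zeros, so agents without a stored memory simply get zero rows.

from typing import Dict, List, Optional

import numpy as np


class MemoryStore:
    """Illustrative reimplementation of the memory helpers shown in the diff."""

    def __init__(self, m_size: int) -> None:
        self.m_size = m_size
        self.memory_dict: Dict[str, np.ndarray] = {}

    def save_memories(
        self, agent_ids: List[str], memory_matrix: Optional[np.ndarray]
    ) -> None:
        if memory_matrix is None:
            return
        for index, agent_id in enumerate(agent_ids):
            self.memory_dict[agent_id] = memory_matrix[index, :]

    def retrieve_memories(self, agent_ids: List[str]) -> np.ndarray:
        # Rows default to zeros, so agents seen for the first time get an
        # empty memory without any special casing.
        memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
        for index, agent_id in enumerate(agent_ids):
            if agent_id in self.memory_dict:
                memory_matrix[index, :] = self.memory_dict[agent_id]
        return memory_matrix


store = MemoryStore(m_size=4)
store.save_memories(["agent-0"], np.ones((1, 4), dtype=np.float32))
print(store.retrieve_memories(["agent-0", "agent-1"]))  # second row is all zeros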

ml-agents/mlagents/trainers/ppo/trainer.py (11 changed lines)


 from mlagents_envs.logging_util import get_logger
 from mlagents_envs.base_env import BehaviorSpec
 from mlagents.trainers.trainer.rl_trainer import RLTrainer
+from mlagents.trainers.policy import Policy
 from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.ppo.optimizer import PPOOptimizer
 from mlagents.trainers.trajectory import Trajectory

         )
         self.load = load
         self.seed = seed
-        self.policy: TFPolicy = None  # type: ignore
+        self.policy: Policy = None  # type: ignore

     def _process_trajectory(self, trajectory: Trajectory) -> None:
         """

         return policy

     def add_policy(
-        self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+        self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:
         """
         Adds policy to trainer.

         )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
+        self.optimizer = PPOOptimizer(
+            cast(TFPolicy, self.policy), self.trainer_settings
+        )

-    def get_policy(self, name_behavior_id: str) -> TFPolicy:
+    def get_policy(self, name_behavior_id: str) -> Policy:
         """
         Gets policy from trainer associated with name_behavior_id
         :param name_behavior_id: full identifier of policy
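
The new PPOOptimizer(...) call is the single point where the trainer crosses back from the abstract annotation to the concrete TensorFlow type, hence the typing.cast. A minimal, hypothetical illustration of that boundary follows; the names (TFOnlyOptimizer, TrainerSketch) are not the real ml-agents classes.

from typing import Optional, cast


class Policy:
    """Abstract-side type the trainer is annotated with."""


class TFPolicy(Policy):
    """Concrete type the TensorFlow optimizer actually requires."""


class TFOnlyOptimizer:
    def __init__(self, policy: TFPolicy) -> None:
        self.policy = policy


class TrainerSketch:
    def __init__(self) -> None:
        self.policy: Optional[Policy] = None

    def add_policy(self, policy: Policy) -> None:
        self.policy = policy
        # cast() only informs the type checker; it is a no-op at runtime, so it
        # is the trainer's assertion that this Policy really is a TFPolicy.
        self.optimizer = TFOnlyOptimizer(cast(TFPolicy, self.policy))


trainer = TrainerSketch()
trainer.add_policy(TFPolicy())

Keeping the cast confined to where the optimizer is constructed is what lets every other annotation in the trainer use the base class.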

ml-agents/mlagents/trainers/sac/trainer.py (11 changed lines)


 from mlagents_envs.timers import timed
 from mlagents_envs.base_env import BehaviorSpec
 from mlagents.trainers.policy.tf_policy import TFPolicy
+from mlagents.trainers.policy import Policy
 from mlagents.trainers.sac.optimizer import SACOptimizer
 from mlagents.trainers.trainer.rl_trainer import RLTrainer
 from mlagents.trainers.trajectory import Trajectory, SplitObservations

         self.load = load
         self.seed = seed
-        self.policy: TFPolicy = None  # type: ignore
+        self.policy: Policy = None  # type: ignore
         self.optimizer: SACOptimizer = None  # type: ignore
         self.hyperparameters: SACSettings = cast(
             SACSettings, trainer_settings.hyperparameters

             self._stats_reporter.add_stat(stat, np.mean(stat_list))

     def add_policy(
-        self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+        self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:
         """
         Adds policy to trainer.

         )
         self.policy = policy
         self.policies[parsed_behavior_id.behavior_id] = policy
-        self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
+        self.optimizer = SACOptimizer(
+            cast(TFPolicy, self.policy), self.trainer_settings
+        )
         for _reward_signal in self.optimizer.reward_signals.keys():
             self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
         # Needed to resume loads properly

             max(1, self.step / self.reward_signal_steps_per_update)
         )

-    def get_policy(self, name_behavior_id: str) -> TFPolicy:
+    def get_policy(self, name_behavior_id: str) -> Policy:
         """
         Gets policy from trainer associated with name_behavior_id
         :param name_behavior_id: full identifier of policy

ml-agents/mlagents/trainers/trainer/rl_trainer.py (4 changed lines)


 )
 from mlagents_envs.logging_util import get_logger
 from mlagents_envs.timers import timed
-from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
+from mlagents.trainers.optimizer import Optimizer
 from mlagents.trainers.buffer import AgentBuffer
 from mlagents.trainers.trainer import Trainer
 from mlagents.trainers.components.reward_signals import RewardSignalResult

             for agent_id in rewards:
                 rewards[agent_id] = 0

-    def _update_end_episode_stats(self, agent_id: str, optimizer: TFOptimizer) -> None:
+    def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None:
         for name, rewards in self.collected_rewards.items():
             if name == "environment":
                 self.stats_reporter.add_stat(

ml-agents/mlagents/trainers/trainer/trainer.py (9 changed lines)


 from mlagents_envs.logging_util import get_logger
 from mlagents_envs.base_env import BehaviorSpec
-from mlagents.trainers.policy.tf_policy import TFPolicy
 from mlagents.trainers.stats import StatsReporter
 from mlagents.trainers.trajectory import Trajectory
 from mlagents.trainers.agent_processor import AgentManagerQueue

         self.step: int = 0
         self.artifact_path = artifact_path
         self.summary_freq = self.trainer_settings.summary_freq
-        self.policies: Dict[str, TFPolicy] = {}
+        self.policies: Dict[str, Policy] = {}

     @property
     def stats_reporter(self):

     @abc.abstractmethod
     def create_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
-    ) -> TFPolicy:
+    ) -> Policy:
         """
         Creates policy
         """

     def add_policy(
-        self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
+        self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
     ) -> None:
         """
         Adds policy to trainer.

     @abc.abstractmethod
-    def get_policy(self, name_behavior_id: str) -> TFPolicy:
+    def get_policy(self, name_behavior_id: str) -> Policy:
         """
         Gets policy from trainer.
         """
