
config fix; basic sac

/develop/bisim-sac-transfer
yanchaosun, 4 years ago
Current commit
f81feec4
7 files changed: 131 insertions and 22 deletions
1. config/ppo_transfer/CrawlerStatic.yaml (5 changes)
2. config/ppo_transfer/TransferCrawlerStatic.yaml (9 changes)
3. ml-agents/mlagents/trainers/sac_transfer/optimizer.py (66 changes)
4. ml-agents/mlagents/trainers/sac_transfer/trainer.py (11 changes)
5. ml-agents/mlagents/trainers/settings.py (47 changes)
6. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (4 changes)
7. ml-agents/mlagents/trainers/trainer_util.py (11 changes)

config/ppo_transfer/CrawlerStatic.yaml (5 changes)


  num_epoch: 3
  learning_rate_schedule: constant
  model_schedule: constant
- encoder_layers: 3
- policy_layers: 0
+ encoder_layers: 2
+ action_layers: 2
+ policy_layers: 2
  forward_layers: 0
  value_layers: 3
  feature_size: 128

config/ppo_transfer/TransferCrawlerStatic.yaml (9 changes)


  num_epoch: 3
  learning_rate_schedule: constant
  model_schedule: constant
- encoder_layers: 3
- policy_layers: 0
+ encoder_layers: 2
+ action_layers: 2
+ policy_layers: 2
  forward_layers: 0
  value_layers: 3
  feature_size: 128

  load_value: false
  load_action: true
  load_model: true
- train_action: true
+ train_action: false
- transfer_path: "results/csold-bisim/CrawlerStatic"
+ transfer_path: "results/csold-bisim-l2/CrawlerStatic"
  network_settings:
    normalize: true
    hidden_units: 512
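The transfer-specific keys above (load_*, train_action, transfer_path) and the layer/feature sizes map directly onto attributes of the new SACTransferSettings class added in settings.py further down. Below is a standalone sketch of that mapping, assuming the defaults introduced by this commit and using a stand-in class instead of importing ml-agents; it is illustration only, not part of the diff.

import attr

@attr.s(auto_attribs=True)
class SACTransferSettingsSketch:
    # Subset of the fields declared on SACTransferSettings in this commit.
    train_action: bool = True
    load_action: bool = True
    load_model: bool = True
    load_value: bool = False
    transfer_path: str = ""
    encoder_layers: int = 1
    policy_layers: int = 1
    feature_size: int = 16

# Keys as they appear in TransferCrawlerStatic.yaml above.
yaml_overrides = {
    "train_action": False,
    "load_action": True,
    "load_model": True,
    "load_value": False,
    "transfer_path": "results/csold-bisim-l2/CrawlerStatic",
    "encoder_layers": 2,
    "policy_layers": 2,
    "feature_size": 128,
}

settings = SACTransferSettingsSketch(**yaml_overrides)
assert settings.train_action is False and settings.feature_size == 128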

ml-agents/mlagents/trainers/sac_transfer/optimizer.py (66 changes)


  from mlagents.tf_utils import tf
  from mlagents_envs.logging_util import get_logger
- from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
+ from mlagents.trainers.sac_transfer.network import SACPolicyNetwork, SACTargetNetwork
  from mlagents.trainers.policy.transfer_policy import TransferPolicy
- from mlagents.trainers.settings import TrainerSettings, SACSettings
+ from mlagents.trainers.settings import TrainerSettings, SACSettings, SACTransferSettings

  EPSILON = 1e-6  # Small value to avoid divide by zero

  TARGET_SCOPE = "target_network"

- class SACOptimizer(TFOptimizer):
+ class SACTransferOptimizer(TFOptimizer):
      def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
          """
          Takes a Unity environment and model-specific hyper-parameters and returns the

          :param tau: Strength of soft-Q update.
          :param m_size: Size of brain memory.
          """
+         hyperparameters: SACTransferSettings = cast(
+             SACTransferSettings, trainer_params.hyperparameters
+         )
+         self.batch_size = hyperparameters.batch_size
+         self.separate_value_train = hyperparameters.separate_value_train
+         self.separate_policy_train = hyperparameters.separate_policy_train
+         self.use_var_encoder = hyperparameters.use_var_encoder
+         self.use_var_predict = hyperparameters.use_var_predict
+         self.with_prior = hyperparameters.with_prior
+         self.use_inverse_model = hyperparameters.use_inverse_model
+         self.predict_return = hyperparameters.predict_return
+         self.reuse_encoder = hyperparameters.reuse_encoder
+         self.use_bisim = hyperparameters.use_bisim
+         self.use_alter = hyperparameters.use_alter
+         self.in_batch_alter = hyperparameters.in_batch_alter
+         self.in_epoch_alter = hyperparameters.in_epoch_alter
+         self.op_buffer = hyperparameters.use_op_buffer
+         self.train_encoder = hyperparameters.train_encoder
+         self.train_action = hyperparameters.train_action
+         self.train_model = hyperparameters.train_model
+         self.train_policy = hyperparameters.train_policy
+         self.train_value = hyperparameters.train_value
+         # Transfer
+         self.use_transfer = hyperparameters.use_transfer
+         self.transfer_path = hyperparameters.transfer_path
+         self.smart_transfer = hyperparameters.smart_transfer
+         self.conv_thres = hyperparameters.conv_thres
+         self.sac_update_dict: Dict[str, tf.Tensor] = {}
+         self.model_update_dict: Dict[str, tf.Tensor] = {}
+         self.model_only_update_dict: Dict[str, tf.Tensor] = {}
+         self.bisim_update_dict: Dict[str, tf.Tensor] = {}
-         policy.create_tf_graph()
+         policy.create_tf_graph(
+             # hyperparameters.encoder_layers,
+             # hyperparameters.action_layers,
+             # hyperparameters.policy_layers,
+             # hyperparameters.forward_layers,
+             # hyperparameters.inverse_layers,
+             # hyperparameters.feature_size,
+             # hyperparameters.action_feature_size,
+             # self.use_transfer,
+             # self.separate_policy_train,
+             # self.use_var_encoder,
+             # self.use_var_predict,
+             # self.predict_return,
+             # self.use_inverse_model,
+             # self.reuse_encoder,
+             # self.use_bisim,
+         )

          hyperparameters: SACSettings = cast(
              SACSettings, trainer_params.hyperparameters
          )
          lr = hyperparameters.learning_rate
          lr_schedule = hyperparameters.learning_rate_schedule
          max_step = trainer_params.max_steps
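Because SACTransferSettings subclasses SACSettings, the cast at the top of __init__ exposes both the inherited SAC fields (learning_rate, batch_size, ...) and the new transfer fields to the optimizer. Below is a standalone sketch of that cast pattern, with hypothetical stand-in classes in place of the real ml-agents settings.

from typing import cast
import attr

@attr.s(auto_attribs=True)
class SACSettingsSketch:                              # stand-in for SACSettings
    learning_rate: float = 3.0e-4
    batch_size: int = 128

@attr.s(auto_attribs=True)
class SACTransferSettingsSketch(SACSettingsSketch):   # stand-in for SACTransferSettings
    use_transfer: bool = False
    transfer_path: str = ""

def build_optimizer(hyperparameters_obj: SACSettingsSketch) -> None:
    # Mirrors the cast in SACTransferOptimizer.__init__: the value is typed as
    # the base settings class, so it is narrowed to the transfer subclass
    # before the transfer-only fields are read.
    hyperparameters = cast(SACTransferSettingsSketch, hyperparameters_obj)
    print(hyperparameters.learning_rate, hyperparameters.use_transfer)

build_optimizer(SACTransferSettingsSketch(use_transfer=True, transfer_path="results/run"))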

ml-agents/mlagents/trainers/sac_transfer/trainer.py (11 changes)


  from mlagents_envs.timers import timed
  from mlagents.trainers.policy.tf_policy import TFPolicy
  from mlagents.trainers.policy.nn_policy import NNPolicy
- from mlagents.trainers.sac.optimizer import SACOptimizer
+ from mlagents.trainers.sac_transfer.optimizer import SACTransferOptimizer
  from mlagents.trainers.trainer.rl_trainer import RLTrainer
  from mlagents.trainers.trajectory import Trajectory, SplitObservations
  from mlagents.trainers.brain import BrainParameters

  BUFFER_TRUNCATE_PERCENT = 0.8

- class SACTrainer(RLTrainer):
+ class SACTransferTrainer(RLTrainer):
-     The SACTrainer is an implementation of the SAC algorithm, with support
-     for discrete actions and recurrent networks.
+     The SACTransferTrainer is a variant of the SAC algorithm which supports model transfer
      """

      def __init__(

          self.load = load
          self.seed = seed
          self.policy: NNPolicy = None  # type: ignore
-         self.optimizer: SACOptimizer = None  # type: ignore
+         self.optimizer: SACTransferOptimizer = None  # type: ignore
          self.hyperparameters: SACSettings = cast(
              SACSettings, trainer_settings.hyperparameters
          )

          if not isinstance(policy, NNPolicy):
              raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
          self.policy = policy
-         self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
+         self.optimizer = SACTransferOptimizer(self.policy, self.trainer_settings)
          for _reward_signal in self.optimizer.reward_signals.keys():
              self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
          # Needed to resume loads properly

ml-agents/mlagents/trainers/settings.py (47 changes)


      def _reward_signal_steps_per_update_default(self):
          return self.steps_per_update

+ @attr.s(auto_attribs=True)
+ class SACTransferSettings(SACSettings):
+     model_schedule: ScheduleType = ScheduleType.LINEAR
+     separate_value_train: bool = False
+     separate_policy_train: bool = False
+     separate_value_net: bool = False
+     use_var_encoder: bool = False
+     use_var_predict: bool = False
+     with_prior: bool = False
+     use_inverse_model: bool = False
+     predict_return: bool = False
+     reuse_encoder: bool = False
+     use_alter: bool = False
+     in_batch_alter: bool = False
+     in_epoch_alter: bool = False
+     use_op_buffer: bool = False
+     train_encoder: bool = True
+     train_action: bool = True
+     train_model: bool = True
+     train_policy: bool = True
+     train_value: bool = True
+     use_bisim: bool = False
+     # Transfer
+     use_transfer: bool = False
+     smart_transfer: bool = False
+     conv_thres: float = 1e-3
+     transfer_path: str = ""
+     load_model: bool = True
+     load_value: bool = False
+     load_policy: bool = False
+     load_encoder: bool = False
+     load_action: bool = False
+     # Network
+     encoder_layers: int = 1
+     action_layers: int = 1
+     policy_layers: int = 1
+     value_layers: int = 1
+     forward_layers: int = 1
+     inverse_layers: int = 1
+     feature_size: int = 16
+     action_feature_size: int = 16

  class RewardSignalType(Enum):
      EXTRINSIC: str = "extrinsic"

      PPO: str = "ppo"
      SAC: str = "sac"
      PPO_Transfer: str = "ppo_transfer"
+     SAC_Transfer: str = "sac_transfer"

-         TrainerType.PPO_Transfer: PPOTransferSettings}
+         TrainerType.PPO_Transfer: PPOTransferSettings, TrainerType.SAC_Transfer: SACTransferSettings}
          return _mapping[self]
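The new _mapping entry is what lets a config declaring the "sac_transfer" trainer type resolve to SACTransferSettings. A self-contained sketch of this enum-to-settings lookup pattern, using hypothetical stand-in names rather than the real ml-agents classes:

from enum import Enum

class SacSettingsSketch: ...            # stand-in for SACSettings
class SacTransferSettingsSketch: ...    # stand-in for SACTransferSettings

class TrainerTypeSketch(Enum):          # stand-in for TrainerType
    SAC = "sac"
    SAC_Transfer = "sac_transfer"

    def to_settings(self) -> type:
        # Same shape as TrainerType.to_settings in settings.py: every enum
        # member needs an entry here, otherwise resolving a "sac_transfer"
        # config raises KeyError.
        _mapping = {
            TrainerTypeSketch.SAC: SacSettingsSketch,
            TrainerTypeSketch.SAC_Transfer: SacTransferSettingsSketch,
        }
        return _mapping[self]

assert TrainerTypeSketch("sac_transfer").to_settings() is SacTransferSettingsSketch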

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (4 changes)


                  obs.append(
                      np.ones((1, self.vec_obs_size), dtype=np.float32) * (2 * i - j)
                  )
-         elif self.obs_spec_type == "long":
+         elif self.obs_spec_type == "long-n":
-         elif self.obs_spec_type == "longpre":
+         elif self.obs_spec_type == "longpre-n":
              for name in self.names:
                  for _ in range(self.extra_obs_size):
                      obs.append(np.random.randn(1, self.vec_obs_size))

ml-agents/mlagents/trainers/trainer_util.py (11 changes)


  from mlagents.trainers.ppo.trainer import PPOTrainer
  from mlagents.trainers.ppo_transfer.trainer import PPOTransferTrainer
  from mlagents.trainers.sac.trainer import SACTrainer
+ from mlagents.trainers.sac_transfer.trainer import SACTransferTrainer
  from mlagents.trainers.ghost.trainer import GhostTrainer
  from mlagents.trainers.ghost.controller import GhostController
  from mlagents.trainers.settings import TrainerSettings, TrainerType

          )
      elif trainer_type == TrainerType.PPO_Transfer:
          trainer = PPOTransferTrainer(
              brain_name,
              min_lesson_length,
              trainer_settings,
              train_model,
              load_model,
              seed,
              trainer_artifact_path,
          )
+     elif trainer_type == TrainerType.SAC_Transfer:
+         trainer = SACTransferTrainer(
+             brain_name,
+             min_lesson_length,
+             trainer_settings,
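For completeness, a reduced, hypothetical sketch of how the new branch fits the dispatch in trainer_util; the remaining SACTransferTrainer constructor arguments are cut off in this excerpt, so the factory below is deliberately simplified and not the real initialize_trainer signature.

# Hypothetical, trimmed-down version of the trainer factory dispatch; the real
# function in trainer_util.py takes many more arguments and trainer types.
def make_trainer_sketch(trainer_type: str, brain_name: str) -> str:
    if trainer_type == "ppo_transfer":
        return f"PPOTransferTrainer({brain_name})"
    elif trainer_type == "sac_transfer":
        # Branch added by this commit: route sac_transfer configs to the new trainer.
        return f"SACTransferTrainer({brain_name})"
    raise ValueError(f"Unknown trainer type: {trainer_type}")

print(make_trainer_sketch("sac_transfer", "CrawlerStatic"))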
