
sac continuous and discrete train

/develop/action-spec-gym
Andrew Cohen, 4 years ago
Commit 056630d7
4 changed files with 68 additions and 161 deletions
  1. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (1 changed line)
  2. ml-agents/mlagents/trainers/sac/optimizer_torch.py (27 changed lines)
  3. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (80 changed lines)
  4. ml-agents/mlagents/trainers/torch/utils.py (121 changed lines)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (1 changed line)


advantage = advantages.unsqueeze(-1)
decay_epsilon = self.hyperparameters.epsilon
r_theta = torch.exp(log_probs - old_log_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
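The hunk above stops inside the clipped-surrogate computation of the PPO policy loss. As a reference, here is a minimal, self-contained sketch of how that objective is usually completed; the clamp term and the final negated mean follow the standard PPO formulation and are assumptions, not a copy of the remaining lines of this file.

import torch

def clipped_surrogate_loss(log_probs, old_log_probs, advantages, epsilon):
    # Ratio between the current and the old policy probabilities.
    advantage = advantages.unsqueeze(-1)
    r_theta = torch.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantage
    # Assumed continuation of the truncated line: the clipped counterpart of p_opt_a.
    p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
    # PPO maximizes the element-wise minimum of the two terms; the optimizer
    # minimizes its negation (the real code averages with a sequence mask).
    return -torch.min(p_opt_a, p_opt_b).mean()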

ml-agents/mlagents/trainers/sac/optimizer_torch.py (27 changed lines)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch.networks import ValueNetwork
-from mlagents.trainers.torch.utils import ModelUtils
+from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.timers import timed
from mlagents.trainers.exception import UnityTrainerException

def sac_value_loss(
self,
-log_probs: torch.Tensor,
+log_probs: ActionLogProbs,
values: Dict[str, torch.Tensor],
q1p_out: Dict[str, torch.Tensor],
q2p_out: Dict[str, torch.Tensor],

for name in values.keys():
with torch.no_grad():
v_backup = min_policy_qs[name] - torch.sum(
-_ent_coef * log_probs, dim=1
+_ent_coef * log_probs.continuous, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks
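For context on why log_probs.continuous is summed over dim=1 above: with a continuous action the joint log-probability is the sum of the per-dimension log-probabilities, and the SAC value target is the minimum of the two policy Q-estimates minus the entropy term. A minimal sketch under those assumptions (shapes and names are illustrative, not taken from this file):

import torch

def continuous_value_backup(min_policy_q, cont_log_probs, ent_coef):
    # min_policy_q: (batch,) minimum of the two Q estimates for the sampled action.
    # cont_log_probs: (batch, action_dims) per-dimension log-probabilities.
    # Backup: v(s) = min(Q1, Q2) - alpha * log pi(a|s), with the joint log-prob
    # obtained by summing over action dimensions.
    return min_policy_q - torch.sum(ent_coef * cont_log_probs, dim=1)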

mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0)
if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
-batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1)
+batch_policy_loss = torch.mean(_ent_coef * log_probs.continuous - mean_q1, dim=1)
policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
else:
action_probs = log_probs.exp()
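The else branch that begins above handles discrete actions, where the expectation over the finite action set can be computed exactly from the action probabilities rather than from a sampled action. A hedged sketch of that idea for a single discrete branch (the optimizer itself also has to handle multiple branches and loss masks):

import torch

def discrete_policy_loss(branch_log_probs, branch_q, ent_coef):
    # branch_log_probs, branch_q: (batch, num_actions) for one discrete branch.
    action_probs = branch_log_probs.exp()
    # Exact expectation over actions: E_{a~pi}[ alpha * log pi(a|s) - Q(s, a) ].
    per_state_loss = torch.sum(action_probs * (ent_coef * branch_log_probs - branch_q), dim=1)
    return torch.mean(per_state_loss)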

) -> torch.Tensor:
if not discrete:
with torch.no_grad():
-target_current_diff = torch.sum(log_probs + self.target_entropy, dim=1)
+target_current_diff = torch.sum(log_probs.continuous + self.target_entropy, dim=1)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
)
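For reference on the entropy-coefficient update above: alpha is learned by driving the policy entropy toward a fixed target, so the loss on log alpha is the negated product of log alpha and the gap between the current log-probability and the target entropy. A minimal continuous-branch sketch; the real code averages with a loss mask over padded sequence steps:

import torch

def entropy_coef_loss(log_ent_coef, cont_log_probs, target_entropy):
    # The policy log-probabilities are treated as constants for this update.
    with torch.no_grad():
        target_current_diff = torch.sum(cont_log_probs + target_entropy, dim=1)
    # Gradient flows only through log_ent_coef.
    return -torch.mean(log_ent_coef * target_current_diff)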

vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
-if self.policy.use_continuous_act:
-actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
-else:
-actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
+actions = AgentAction.extract(batch)
+#if self.policy.use_continuous_act:
+# actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
+#else:
+# actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
memories_list = [
ModelUtils.list_to_tensor(batch["memory"][i])

vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
)
if self.policy.use_continuous_act:
-squeezed_actions = actions.squeeze(-1)
+squeezed_actions = actions.continuous#squeeze(-1)
-sampled_actions,
+sampled_actions.continuous,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,

memories=q_memories,
sequence_length=self.policy.sequence_length,
)
-q1_stream = self._condense_q_streams(q1_out, actions)
-q2_stream = self._condense_q_streams(q2_out, actions)
+q1_stream = self._condense_q_streams(q1_out, actions.discrete_tensor)
+q2_stream = self._condense_q_streams(q2_out, actions.discrete_tensor)
with torch.no_grad():
target_values, _ = self.target_network(
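The _condense_q_streams calls above now take actions.discrete_tensor, the stacked per-branch discrete actions. The underlying idea (the helper in this file may differ in its handling of multiple branches and reward streams) is to reduce each Q head's per-action outputs to the Q-value of the action that was actually taken, e.g. via a one-hot gather:

import torch

def q_of_taken_actions(branch_q, taken_actions, num_actions):
    # branch_q: (batch, num_actions) Q-values for one discrete branch.
    # taken_actions: (batch,) integer action indices for that branch.
    onehot = torch.nn.functional.one_hot(taken_actions.long(), num_actions).float()
    return torch.sum(onehot * branch_q, dim=1)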

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (80 changed lines)


SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(PPO_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(
PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
)
config = attr.evolve(
PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
)
check_environment_trains(env, {BRAIN_NAME: config})
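A brief note on the attr.evolve pattern used throughout these tests: it returns a copy of an attrs-decorated settings object with the given fields replaced, so the module-level default config is never mutated. A standalone illustration with a hypothetical Settings class (not the ml-agents one):

import attr

@attr.s(auto_attribs=True)
class Settings:
    batch_size: int = 32
    buffer_size: int = 320

base = Settings()
override = attr.evolve(base, batch_size=64, buffer_size=640)
assert base.batch_size == 32 and override.batch_size == 64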
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_simple_ppo(use_discrete):
# env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
# config = attr.evolve(PPO_TORCH_CONFIG)
# check_environment_trains(env, {BRAIN_NAME: config})
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_2d_ppo(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
# )
# new_hyperparams = attr.evolve(
# PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
# )
# config = attr.evolve(
# PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
# )
# check_environment_trains(env, {BRAIN_NAME: config})
#@pytest.mark.parametrize("use_discrete", [True, False])

# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_simple_sac(use_discrete):
# env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
# config = attr.evolve(SAC_TORCH_CONFIG)
# check_environment_trains(env, {BRAIN_NAME: config})
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_2d_sac(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
# )
# new_hyperparams = attr.evolve(
# SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
# )
# config = attr.evolve(
# SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
#
#
@pytest.mark.parametrize("use_discrete", [True])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(SAC_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
)
config = attr.evolve(
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
#@pytest.mark.parametrize("use_discrete", [True, False])
#@pytest.mark.parametrize("num_visual", [1, 2])
#def test_visual_sac(num_visual, use_discrete):

ml-agents/mlagents/trainers/torch/utils.py (121 changed lines)


from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance
class ActionSpaceTuple(NamedTuple):
continuous: torch.Tensor
discrete: List[torch.Tensor]
@property
def discrete_tensor(self):
return torch.cat([_disc.unsqueeze(-1) for _disc in self.discrete], dim=1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
array_dict: Dict[str, np.ndarray] = {}

discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
return AgentAction(continuous, discrete)
class ActionLogProbs(ActionSpaceTuple):
class ActionLogProbs(NamedTuple):
@property
def discrete_tensor(self):
return torch.cat([_disc.unsqueeze(-1) for _disc in self.discrete], dim=1)
def to_numpy_dict(self) -> Dict[str, np.ndarray]:
array_dict: Dict[str, np.ndarray] = {}
if self.continuous is not None:

if self.continuous is not None:
tensor_list.append(self.continuous)
if self.discrete is not None:
tensor_list += self.discrete
for _disc in self.discrete:
tensor_list.append(_disc.unsqueeze(-1))
-return torch.stack(self.to_tensor_list(), dim=-1)
+return torch.cat(self.to_tensor_list(), dim=1)
@staticmethod
def create(tensor_list: List[torch.Tensor], action_spec: ActionSpec) -> "ActionLogProbs":

discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
return ActionLogProbs(continuous, discrete)
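The extract/create pair above converts between buffer arrays and the new NamedTuple action types. Judging from the commented-out helpers further down in this file, the intended behavior is roughly the sketch below; the buffer keys "continuous_action" and "discrete_action" are taken from that commented code and may still be in flux on this branch.

from typing import Dict, List, NamedTuple, Optional
import numpy as np
import torch

class AgentActionSketch(NamedTuple):
    continuous: Optional[torch.Tensor]
    discrete: Optional[List[torch.Tensor]]

    @property
    def discrete_tensor(self) -> torch.Tensor:
        # Re-stack the per-branch tensors into (batch, num_branches).
        return torch.cat([d.unsqueeze(-1) for d in self.discrete], dim=1)

    @staticmethod
    def extract(buff: Dict[str, np.ndarray]) -> "AgentActionSketch":
        continuous = None
        discrete = None
        if "continuous_action" in buff:
            continuous = torch.as_tensor(buff["continuous_action"], dtype=torch.float32)
        if "discrete_action" in buff:
            discrete_tensor = torch.as_tensor(buff["discrete_action"], dtype=torch.long)
            # One (batch,) tensor per discrete action branch.
            discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
        return AgentActionSketch(continuous, discrete)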
#def to_numpy_dict(self) -> Dict[str, np.ndarray]:
# action_arrays_dict: Dict[str, np.ndarray] = {}
# if self.continuous is not None:
# action_arrays_dict["continuous_action"] = ModelUtils.to_numpy(self.continuous.unsqueeze(-1)[:, :, 0])
# if self.discrete is not None:
# discrete_tensor = torch.stack(self.discrete, dim=-1)
# action_arrays_dict["discrete_action"] = ModelUtils.to_numpy(discrete_tensor[:, 0, :])
# return action_arrays_dict
#def to_tensor_list(self) -> List[torch.Tensor]:
# tensor_list: List[torch.Tensor] = []
# if self.continuous is not None:
# tensor_list.append(self.continuous)
# if self.discrete is not None:
# tensor_list += self.discrete
# return tensor_list
#def flatten(self) -> torch.Tensor:
# return torch.stack(self.to_tensor_list(), dim=-1)
#@staticmethod
#def extract_agent_action(buff: Dict[str, np.ndarray]) -> "AgentAction":
# continuous: torch.Tensor = None
# discrete: List[torch.Tensor] = None
# if "continuous_action" in buff:
# continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
# if "discrete_action" in buff:
# discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_action"])
# discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
# return AgentAction(continuous, discrete)
#
#@staticmethod
#def create_agent_action(action_tensors: List[torch.Tensor], action_spec: ActionSpec) -> "AgentAction":
# continuous: torch.Tensor = None
# discrete: List[torch.Tensor] = None
# _offset = 0
# if action_spec.continuous_size > 0:
# continuous = action_tensors[0]
# _offset = 1
# if action_spec.discrete_size > 0:
# discrete = action_tensors[_offset:]
# return AgentAction(continuous, discrete)
#def to_numpy_dict(self) -> Dict[str, np.ndarray]:
# log_prob_arrays_dict: Dict[str, np.ndarray] = {}
# if self.continuous is not None:
# log_prob_arrays_dict["continuous_log_probs"] = ModelUtils.to_numpy(self.continuous)
# if self.discrete is not None:
# discrete_tensor = torch.stack(self.discrete, dim=-1)
# log_prob_arrays_dict["discrete_log_probs"] = ModelUtils.to_numpy(discrete_tensor.squeeze(1))
# return log_prob_arrays_dict
#def to_tensor_list(self) -> List[torch.Tensor]:
# tensor_list: List[torch.Tensor] = []
# if self.continuous is not None:
# tensor_list.append(self.continuous)
# if self.discrete is not None:
# tensor_list += self.discrete
# return tensor_list
#def flatten(self) -> torch.Tensor:
# return torch.stack(self.to_tensor_list(), dim=-1)
#@staticmethod
#def extract_action_log_probs(buff: Dict[str, np.ndarray]) -> "AgentAction":
# continuous: torch.Tensor = None
# discrete: List[torch.Tensor] = None
# if "continuous_action" in buff:
# continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"]).unsqueeze(-1)
# if "discrete_action" in buff:
# discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
# discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
# return ActionLogProbs(continuous, discrete)
#@staticmethod
#def create_action_log_probs(log_prob_tensors: List[torch.Tensor], action_spec: ActionSpec) -> "AgentAction":
# continuous: torch.Tensor = None
# discrete: List[torch.Tensor] = None
# _offset = 0
# if action_spec.continuous_size > 0:
# continuous = log_prob_tensors[0]
# _offset = 1
# if action_spec.discrete_size > 0:
# discrete = log_prob_tensors[_offset:]
# return ActionLogProbs(continuous, discrete)
class ModelUtils:
# Minimum supported side for each encoder type. If refactoring an encoder, please
# adjust these also.

"""
return ActionBuffers(agent_actions.continuous.detach().cpu().numpy(), agent_actions.discrete.detach().cpu().numpy())
#@staticmethod
#def action_buffers_to_agent_action(
# action_buffers: ActionBuffers, dtype: Optional[torch.dtype] = None
#) -> AgentAction:
# """
# Converts ActionBuffers fields into a AgentAction fields
# """
# return AgentAction(torch.as_tensor(np.asanyarray(action_buffers.continuous), dtype=dtype),
#torch.as_tensor(np.asanyarray(_disc), dtype=dtype))
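The return statement above hands an AgentAction back as an ActionBuffers of NumPy arrays; since the discrete field is a list of per-branch tensors, a working conversion typically stacks it into a single array first. A hedged sketch of the round trip with a stand-in ActionBuffers tuple (the real class lives in mlagents_envs and may differ):

from typing import List, NamedTuple, Tuple
import numpy as np
import torch

class ActionBuffersSketch(NamedTuple):
    continuous: np.ndarray
    discrete: np.ndarray

def to_buffers(continuous: torch.Tensor, discrete: List[torch.Tensor]) -> ActionBuffersSketch:
    # Detach from the autograd graph and stack the branches into (batch, num_branches).
    return ActionBuffersSketch(
        continuous.detach().cpu().numpy(),
        torch.stack(discrete, dim=-1).detach().cpu().numpy(),
    )

def from_buffers(buffers: ActionBuffersSketch) -> Tuple[torch.Tensor, List[torch.Tensor]]:
    continuous = torch.as_tensor(np.asanyarray(buffers.continuous), dtype=torch.float32)
    discrete = [
        torch.as_tensor(col, dtype=torch.long)
        for col in np.asanyarray(buffers.discrete).T
    ]
    return continuous, discrete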
@staticmethod
def list_to_tensor(
