
bc tests pass

/develop/action-spec-gym
Andrew Cohen · 4 years ago
Current commit: 85e4db33
8 files changed, 176 insertions(+), 161 deletions(-)
  1. ml-agents/mlagents/trainers/agent_processor.py (3 changes)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (35 changes)
  3. ml-agents/mlagents/trainers/sac/optimizer_torch.py (35 changes)
  4. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (5 changes)
  5. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (108 changes)
  6. ml-agents/mlagents/trainers/torch/components/bc/module.py (13 changes)
  7. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (13 changes)
  8. ml-agents/mlagents/trainers/torch/utils.py (125 changes)

ml-agents/mlagents/trainers/agent_processor.py (3 changes)


from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue
import numpy as np
from mlagents_envs.base_env import (
DecisionSteps,

action_probs[prob_type] = prob_array[idx]
action_mask = stored_decision_step.action_mask
prev_action = self.policy.retrieve_previous_action([global_id])#[0, :]
prev_action = self.policy.retrieve_previous_action([global_id]) # [0, :]
experience = AgentExperience(
obs=obs,
reward=step.reward,
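
Note: the visible hunk suggests the processor now indexes per-agent entries out of the batched numpy dicts produced by the policy (action_probs[prob_type] = prob_array[idx]). A minimal sketch of that pattern with hypothetical data, not part of the commit:

import numpy as np

# Hypothetical batched policy output for 2 agents; real keys come from to_numpy_dict().
stored_log_probs = {"continuous_log_probs": np.zeros((2, 3), dtype=np.float32)}
idx = 0  # position of this agent in the batched output
action_probs = {
    prob_type: prob_array[idx] for prob_type, prob_array in stored_log_probs.items()
}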

ml-agents/mlagents/trainers/policy/torch_policy.py (35 changes)


from typing import Any, Dict, List, Tuple, Optional, Union
from typing import Any, Dict, List, Tuple, Optional
import numpy as np
from mlagents.torch_utils import torch, default_device
import copy

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
all_log_probs: bool = False,
) -> Tuple[List[torch.Tensor], Union[torch.Tensor, List[torch.Tensor]], torch.Tensor, torch.Tensor]:
) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor, torch.Tensor]:
"""
:param vec_obs: List of vector observations.
:param vis_obs: List of visual observations.

:param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
:return: Tuple of actions, log probabilities (dependent on all_log_probs), entropies, and
output memories, all as Torch Tensors.
"""

vec_obs, vis_obs, masks, memories, seq_len
)
action_list = self.actor_critic.sample_action(dists)
log_probs_list, entropies, all_logs = ModelUtils.get_probs_and_entropy(
log_probs_list, entropies, all_logs_list = ModelUtils.get_probs_and_entropy(
log_probs = ActionLogProbs.create(log_probs_list, self.behavior_spec.action_spec)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec, all_logs_list
)
return (
actions,
all_logs if all_log_probs else log_probs,
entropy_sum,
memories,
)
return (actions, log_probs, entropy_sum, memories)
def evaluate_actions(
self,

masks: Optional[torch.Tensor] = None,
memories: Optional[torch.Tensor] = None,
seq_len: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
) -> Tuple[ActionLogProbs, torch.Tensor, Dict[str, torch.Tensor]]:
log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
log_probs = ActionLogProbs.create(log_probs_list, self.behavior_spec.action_spec)
log_probs_list, entropies, _ = ModelUtils.get_probs_and_entropy(
action_list, dists
)
log_probs = ActionLogProbs.create(
log_probs_list, self.behavior_spec.action_spec
)
# Use the sum of entropy across actions, not the mean
entropy_sum = torch.sum(entropies, dim=1)
return log_probs, entropy_sum, value_heads

vec_obs, vis_obs, masks=masks, memories=memories
)
run_out["action"] = action.to_numpy_dict()
run_out["pre_action"] = action.to_numpy_dict()["continuous_action"] if self.use_continuous_act else None# Todo - make pre_action difference
run_out["pre_action"] = (
action.to_numpy_dict()["continuous_action"]
if self.use_continuous_act
else None
) # Todo - make pre_action difference
run_out["log_probs"] = log_probs.to_numpy_dict()
run_out["entropy"] = ModelUtils.to_numpy(entropy)
run_out["learning_rate"] = 0.0

ml-agents/mlagents/trainers/sac/optimizer_torch.py (35 changes)


if not discrete:
min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
else:
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
_branched_q1p = ModelUtils.break_into_branches(
q1p_out[name] * action_probs, self.act_size
)

for name in values.keys():
with torch.no_grad():
v_backup = min_policy_qs[name] - torch.sum(
_ent_coef * log_probs.continuous, dim=1
_ent_coef * log_probs.continuous_tensor, dim=1
)
value_loss = 0.5 * ModelUtils.masked_mean(
torch.nn.functional.mse_loss(values[name], v_backup), loss_masks

branched_per_action_ent = ModelUtils.break_into_branches(
log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
# We have to do entropy bonus per action branch
branched_ent_bonus = torch.stack(

def sac_policy_loss(
self,
log_probs: torch.Tensor,
log_probs: ActionLogProbs,
q1p_outs: Dict[str, torch.Tensor],
loss_masks: torch.Tensor,
discrete: bool,

if not discrete:
mean_q1 = mean_q1.unsqueeze(1)
batch_policy_loss = torch.mean(_ent_coef * log_probs.continuous - mean_q1, dim=1)
batch_policy_loss = torch.mean(
_ent_coef * log_probs.continuous_tensor - mean_q1, dim=1
)
action_probs = log_probs.exp()
action_probs = log_probs.all_discrete_tensor.exp()
log_probs * action_probs, self.act_size
log_probs.all_discrete_tensor * action_probs, self.act_size
)
branched_q_term = ModelUtils.break_into_branches(
mean_q1 * action_probs, self.act_size

return policy_loss
def sac_entropy_loss(
self, log_probs: torch.Tensor, loss_masks: torch.Tensor, discrete: bool
self, log_probs: ActionLogProbs, loss_masks: torch.Tensor, discrete: bool
target_current_diff = torch.sum(log_probs.continuous + self.target_entropy, dim=1)
target_current_diff = torch.sum(
log_probs.continuous_tensor + self.target_entropy, dim=1
)
entropy_loss = -1 * ModelUtils.masked_mean(
self._log_ent_coef * target_current_diff, loss_masks
)

log_probs * log_probs.exp(), self.act_size
log_probs.all_discrete_tensor * log_probs.all_discrete_tensor.exp(),
self.act_size,
)
target_current_diff_branched = torch.stack(
[

next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
actions = AgentAction.extract(batch)
#if self.policy.use_continuous_act:
# actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
#else:
# actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
memories_list = [
ModelUtils.list_to_tensor(batch["memory"][i])

masks=act_masks,
memories=memories,
seq_len=self.policy.sequence_length,
all_log_probs=not self.policy.use_continuous_act,
squeezed_actions = actions.continuous#squeeze(-1)
squeezed_actions = actions.continuous_tensor
sampled_actions.continuous,
sampled_actions.continuous_tensor,
memories=q_memories,
sequence_length=self.policy.sequence_length,
q2_grad=False,
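
Note: the SAC losses now take an ActionLogProbs object. Continuous terms read log_probs.continuous_tensor; discrete terms use log_probs.all_discrete_tensor (log-probabilities of every action in every branch) split per branch. A sketch of the per-branch entropy-bonus step seen above; the final reduction is elided because it is cut off in this hunk:

# Assumes log_probs, ModelUtils, self.act_size as in the hunk above.
all_log_probs = log_probs.all_discrete_tensor  # shape [batch, sum(branch sizes)]
branched_per_action_ent = ModelUtils.break_into_branches(
    all_log_probs * all_log_probs.exp(), self.act_size  # p * log p per action
)
# Each branch is then reduced and the branches combined into branched_ent_bonus
# via the torch.stack(...) above; the exact reduction is not visible in this diff.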

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (5 changes)


next_observations = [
np.random.normal(size=shape) for shape in behavior_spec.observation_shapes
]
action = behavior_spec.action_spec.random_action(1)[0, :]
action = behavior_spec.action_spec.random_action(1)
for _ in range(number):
curr_split_obs = SplitObservations.from_observations(curr_observations)
next_split_obs = SplitObservations.from_observations(next_observations)

)
buffer["vector_obs"].append(curr_split_obs.vector_observations)
buffer["next_vector_in"].append(next_split_obs.vector_observations)
buffer["actions"].append(action)
for _act_type, _act in action.items():
buffer[_act_type].append(_act)
buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
buffer["masks"].append(np.ones(1, dtype=np.float32))
buffer["done"] = np.zeros(number, dtype=np.float32)

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (108 changes)


SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_simple_ppo(use_discrete):
# env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
# config = attr.evolve(PPO_TORCH_CONFIG)
# check_environment_trains(env, {BRAIN_NAME: config})
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_ppo(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(PPO_TORCH_CONFIG)
check_environment_trains(env, {BRAIN_NAME: config})
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_2d_ppo(use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_2d_ppo(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
# )

# check_environment_trains(env, {BRAIN_NAME: config})
#@pytest.mark.parametrize("use_discrete", [True, False])
#@pytest.mark.parametrize("num_visual", [1, 2])
#def test_visual_ppo(num_visual, use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# @pytest.mark.parametrize("num_visual", [1, 2])
# def test_visual_ppo(num_visual, use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=use_discrete,

# check_environment_trains(env, {BRAIN_NAME: config})
#
#
#@pytest.mark.parametrize("num_visual", [1, 2])
#@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
#def test_visual_advanced_ppo(vis_encode_type, num_visual):
# @pytest.mark.parametrize("num_visual", [1, 2])
# @pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
# def test_visual_advanced_ppo(vis_encode_type, num_visual):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=True,

# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_recurrent_ppo(use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_recurrent_ppo(use_discrete):
# env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
# new_network_settings = attr.evolve(
# PPO_TORCH_CONFIG.network_settings,

# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
#
#
@pytest.mark.parametrize("use_discrete", [True])
@pytest.mark.parametrize("use_discrete", [True, False])
def test_simple_sac(use_discrete):
env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete)
config = attr.evolve(SAC_TORCH_CONFIG)

@pytest.mark.parametrize("use_discrete", [True])
def test_2d_sac(use_discrete):
env = SimpleEnvironment(
[BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
)
new_hyperparams = attr.evolve(
SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
)
config = attr.evolve(
SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
)
check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
# @pytest.mark.parametrize("use_discrete", [True])
# def test_2d_sac(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
# )
# new_hyperparams = attr.evolve(
# SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
# )
# config = attr.evolve(
# SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
# )
# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
#@pytest.mark.parametrize("use_discrete", [True, False])
#@pytest.mark.parametrize("num_visual", [1, 2])
#def test_visual_sac(num_visual, use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# @pytest.mark.parametrize("num_visual", [1, 2])
# def test_visual_sac(num_visual, use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=use_discrete,

# check_environment_trains(env, {BRAIN_NAME: config})
#
#
#@pytest.mark.parametrize("num_visual", [1, 2])
#@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
#def test_visual_advanced_sac(vis_encode_type, num_visual):
# @pytest.mark.parametrize("num_visual", [1, 2])
# @pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
# def test_visual_advanced_sac(vis_encode_type, num_visual):
# env = SimpleEnvironment(
# [BRAIN_NAME],
# use_discrete=True,

# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_recurrent_sac(use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_recurrent_sac(use_discrete):
# step_size = 0.2 if use_discrete else 0.5
# env = MemoryEnvironment(
# [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size

# check_environment_trains(env, {BRAIN_NAME: config})
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_simple_ghost(use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_ghost(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
# )

# check_environment_trains(env, {BRAIN_NAME: config})
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_simple_ghost_fails(use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_ghost_fails(use_discrete):
# env = SimpleEnvironment(
# [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], use_discrete=use_discrete
# )

# )
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_simple_asymm_ghost(use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_asymm_ghost(use_discrete):
# # Make opponent for asymmetric case
# brain_name_opp = BRAIN_NAME + "Opp"
# env = SimpleEnvironment(

# check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_simple_asymm_ghost_fails(use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_simple_asymm_ghost_fails(use_discrete):
# # Make opponent for asymmetric case
# brain_name_opp = BRAIN_NAME + "Opp"
# env = SimpleEnvironment(

# )
#
#
#@pytest.fixture(scope="session")
#def simple_record(tmpdir_factory):
# @pytest.fixture(scope="session")
# def simple_record(tmpdir_factory):
# def record_demo(use_discrete, num_visual=0, num_vector=1):
# env = RecordEnvironment(
# [BRAIN_NAME],

# return record_demo
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#@pytest.mark.parametrize("trainer_config", [PPO_TORCH_CONFIG, SAC_TORCH_CONFIG])
#def test_gail(simple_record, use_discrete, trainer_config):
# @pytest.mark.parametrize("use_discrete", [True, False])
# @pytest.mark.parametrize("trainer_config", [PPO_TORCH_CONFIG, SAC_TORCH_CONFIG])
# def test_gail(simple_record, use_discrete, trainer_config):
# demo_path = simple_record(use_discrete)
# env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
# bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)

# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_gail_visual_ppo(simple_record, use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_gail_visual_ppo(simple_record, use_discrete):
# demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
# env = SimpleEnvironment(
# [BRAIN_NAME],

# check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
#
#
#@pytest.mark.parametrize("use_discrete", [True, False])
#def test_gail_visual_sac(simple_record, use_discrete):
# @pytest.mark.parametrize("use_discrete", [True, False])
# def test_gail_visual_sac(simple_record, use_discrete):
# demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
# env = SimpleEnvironment(
# [BRAIN_NAME],
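
Note: most of the torch simple-RL tests are commented out in this commit; only test_simple_ppo, test_simple_sac, and test_2d_sac remain active. Re-enabling one would follow the same skeleton, e.g. (sketch only; any config tweaks the original test made are not visible in this hunk):

@pytest.mark.parametrize("use_discrete", [True, False])
def test_2d_ppo(use_discrete):
    env = SimpleEnvironment(
        [BRAIN_NAME], use_discrete=use_discrete, action_size=2, step_size=0.8
    )
    config = attr.evolve(PPO_TORCH_CONFIG)
    check_environment_trains(env, {BRAIN_NAME: config})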

ml-agents/mlagents/trainers/torch/components/bc/module.py (13 changes)


from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction, ActionLogProbs
class BCModule:

update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)}
return update_stats
def _behavioral_cloning_loss(self, selected_actions, log_probs, expert_actions):
def _behavioral_cloning_loss(self, selected_actions: AgentAction, log_probs: ActionLogProbs, expert_actions: torch.Tensor):
bc_loss = torch.nn.functional.mse_loss(selected_actions, expert_actions)
bc_loss = torch.nn.functional.mse_loss(selected_actions.continuous_tensor, expert_actions)
log_probs, self.policy.act_size
log_probs.all_discrete_tensor, self.policy.behavior_spec.action_spec.discrete_branches
)
bc_loss = torch.mean(
torch.stack(

else:
vis_obs = []
selected_actions, all_log_probs, _, _ = self.policy.sample_actions(
selected_actions, log_probs, _, _ = self.policy.sample_actions(
all_log_probs=True,
selected_actions, all_log_probs, expert_actions
selected_actions, log_probs, expert_actions
)
self.optimizer.zero_grad()
bc_loss.backward()
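
Note: _behavioral_cloning_loss now takes an AgentAction and an ActionLogProbs. For continuous control it is an MSE between selected_actions.continuous_tensor and the expert actions; for discrete control, log_probs.all_discrete_tensor is broken into discrete_branches and combined with the expert actions inside the torch.mean(torch.stack(...)) seen above. A hedged sketch of the two paths; the per-branch reduction is an assumption, not copied from this file:

# Assumes self.policy, selected_actions, log_probs, expert_actions, ModelUtils, torch
# as in the hunk above; expert_actions assumed [batch, num_branches] of discrete indices
# in the discrete case.
if self.policy.use_continuous_act:
    bc_loss = torch.nn.functional.mse_loss(
        selected_actions.continuous_tensor, expert_actions
    )
else:
    one_hot_expert = ModelUtils.actions_to_onehot(
        expert_actions, self.policy.behavior_spec.action_spec.discrete_branches
    )
    log_prob_branches = ModelUtils.break_into_branches(
        log_probs.all_discrete_tensor,
        self.policy.behavior_spec.action_spec.discrete_branches,
    )
    # Assumed reduction: cross-entropy per branch, averaged over branches.
    bc_loss = torch.mean(
        torch.stack(
            [
                torch.sum(-expert_branch * log_prob_branch, dim=1)
                for expert_branch, log_prob_branch in zip(one_hot_expert, log_prob_branches)
            ]
        )
    )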

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (13 changes)


from mlagents.trainers.settings import CuriositySettings
from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.torch.utils import ModelUtils
from mlagents.trainers.torch.utils import ModelUtils, AgentAction
from mlagents.trainers.torch.networks import NetworkBody
from mlagents.trainers.torch.layers import LinearEncoder, linear_layer
from mlagents.trainers.settings import NetworkSettings, EncoderType

Uses the current state embedding and the action of the mini_batch to predict
the next state embedding.
"""
actions = AgentAction.extract(mini_batch)
action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
action = actions.continuous_tensor
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
actions.discrete_tensor,
print(self.get_current_state(mini_batch), action)
forward_model_input = torch.cat(
(self.get_current_state(mini_batch), action), dim=1
)

action prediction (given the current and next state).
"""
predicted_action = self.predict_action(mini_batch)
actions = AgentAction.extract(mini_batch)
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
actions.continuous_tensor
- predicted_action
) ** 2
sq_difference = torch.sum(sq_difference, dim=1)

else:
true_action = torch.cat(
ModelUtils.actions_to_onehot(
ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
actions.discrete_tensor,
self._action_spec.discrete_branches,
),
dim=1,
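
Note: the curiosity module now reads actions through AgentAction.extract(mini_batch) instead of mini_batch["actions"]: the forward model concatenates the state embedding with actions.continuous_tensor (or a one-hot of actions.discrete_tensor), and the inverse loss compares the prediction against the same quantities. Sketch of the forward-model input; the branch condition is an assumption since it is not visible in this hunk:

actions = AgentAction.extract(mini_batch)
if actions.continuous_tensor is not None:   # assumed condition
    action = actions.continuous_tensor
else:
    action = torch.cat(
        ModelUtils.actions_to_onehot(
            actions.discrete_tensor, self._action_spec.discrete_branches
        ),
        dim=1,
    )
forward_model_input = torch.cat((self.get_current_state(mini_batch), action), dim=1)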

ml-agents/mlagents/trainers/torch/utils.py (125 changes)


)
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.exception import UnityTrainerException
from mlagents_envs.base_env import ActionSpec, ActionBuffers
from mlagents_envs.base_env import ActionSpec
continuous: torch.Tensor
discrete: List[torch.Tensor]
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
return torch.cat([_disc.unsqueeze(-1) for _disc in self.discrete], dim=1)
return torch.cat([_disc.unsqueeze(-1) for _disc in self.discrete_list], dim=1)
if self.continuous is not None:
array_dict["continuous_action"] = ModelUtils.to_numpy(self.continuous)
if self.discrete is not None:
discrete_tensor = torch.stack(self.discrete, dim=-1)
array_dict["discrete_action"] = ModelUtils.to_numpy(discrete_tensor[:, 0, :])
if self.continuous_tensor is not None:
array_dict["continuous_action"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_action"] = ModelUtils.to_numpy(
self.discrete_tensor[:, 0, :]
)
tensor_list : List[torch.Tensor] = []
if self.continuous is not None:
tensor_list.append(self.continuous)
if self.discrete is not None:
tensor_list += self.discrete
return tensor_list
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list.append(self.discrete_tensor)
return tensor_list
@staticmethod
def create(tensor_list: List[torch.Tensor], action_spec: ActionSpec) -> "AgentActions":
@staticmethod
def create(
tensor_list: List[torch.Tensor], action_spec: ActionSpec
) -> "AgentAction":
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None
_offset = 0

return AgentAction(continuous, discrete)
@staticmethod
def extract(buff: Dict[str, np.ndarray]) -> "AgentActions":
def extract(buff: Dict[str, np.ndarray]) -> "AgentAction":
continuous: torch.Tensor = None
discrete: List[torch.Tensor] = None
if "continuous_action" in buff:

discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
continuous: torch.Tensor
discrete: List[torch.Tensor]
continuous_tensor: torch.Tensor
discrete_list: List[torch.Tensor]
all_discrete_list: List[torch.Tensor]
return torch.cat([_disc.unsqueeze(-1) for _disc in self.discrete], dim=1)
return torch.cat([_disc.unsqueeze(-1) for _disc in self.discrete_list], dim=1)
@property
def all_discrete_tensor(self):
return torch.cat(self.all_discrete_list, dim=1)
if self.continuous is not None:
array_dict["continuous_log_probs"] = ModelUtils.to_numpy(self.continuous)
if self.discrete is not None:
discrete_tensor = torch.stack(self.discrete, dim=-1)
array_dict["discrete_log_probs"] = ModelUtils.to_numpy(discrete_tensor)
if self.continuous_tensor is not None:
array_dict["continuous_log_probs"] = ModelUtils.to_numpy(
self.continuous_tensor
)
if self.discrete_list is not None:
array_dict["discrete_log_probs"] = ModelUtils.to_numpy(self.discrete_tensor)
tensor_list : List[torch.Tensor] = []
if self.continuous is not None:
tensor_list.append(self.continuous)
if self.discrete is not None:
for _disc in self.discrete:
tensor_list.append(_disc.unsqueeze(-1))
return tensor_list
tensor_list: List[torch.Tensor] = []
if self.continuous_tensor is not None:
tensor_list.append(self.continuous_tensor)
if self.discrete_list is not None:
tensor_list.append(self.discrete_tensor)
return tensor_list
@staticmethod
def create(tensor_list: List[torch.Tensor], action_spec: ActionSpec) -> "ActionLogProbs":
@staticmethod
def create(
log_prob_list: List[torch.Tensor],
action_spec: ActionSpec,
all_log_prob_list: List[torch.Tensor] = None,
) -> "ActionLogProbs":
continuous = tensor_list[0]
continuous = log_prob_list[0]
discrete = tensor_list[_offset:]
return ActionLogProbs(continuous, discrete)
discrete = log_prob_list[_offset:]
return ActionLogProbs(continuous, discrete, all_log_prob_list)
@staticmethod
def extract(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":

continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])
if "discrete_log_probs" in buff:
discrete_tensor = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
return ActionLogProbs(continuous, discrete)
discrete = [
discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])
]
return ActionLogProbs(continuous, discrete, None)
class ModelUtils:
# Minimum supported side for each encoder type. If refactoring an encoder, please
# adjust these also.

)
@staticmethod
def to_action_buffers(agent_actions: AgentAction, action_spec: ActionSpec) -> ActionBuffers:
"""
Converts a list of action Tensors to an ActionBuffers tuple. Implicitly
assumes order of actions in 'actions' is continuous, discrete
"""
return ActionBuffers(agent_actions.continuous.detach().cpu().numpy(), agent_actions.discrete.detach().cpu().numpy())
@staticmethod
def list_to_tensor(
ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = None
) -> torch.Tensor:

entropies_list.append(action_dist.entropy())
if isinstance(action_dist, DiscreteDistInstance):
all_probs_list.append(action_dist.all_log_prob())
#log_probs = torch.stack(log_probs_list, dim=-1)
# log_probs = log_probs.squeeze(-1)
all_probs = None
else:
all_probs = torch.cat(all_probs_list, dim=-1)
return log_probs_list, entropies, all_probs
# all_probs = None
# else:
# all_probs = torch.cat(all_probs_list, dim=-1)
return log_probs_list, entropies, all_probs_list
@staticmethod
def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:

alpha=tau,
out=target_param.data,
)
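
Note: the net effect of the utils.py changes is two small containers: AgentAction (continuous_tensor, discrete_list, a stacked discrete_tensor property, to_numpy_dict, to_tensor_list, static create/extract) and ActionLogProbs (the same shape plus all_discrete_list/all_discrete_tensor holding per-branch log-probs, needed by SAC and BC). A hedged round-trip sketch of how the rest of this diff uses them; the lowercase variables are placeholders, and shapes/defaults are assumptions:

# Build from a policy forward pass:
action = AgentAction.create(sampled_tensor_list, behavior_spec.action_spec)
log_probs = ActionLogProbs.create(
    log_prob_list, behavior_spec.action_spec, all_log_prob_list
)

# Serialize for the trajectory / AgentBuffer:
run_out_action = action.to_numpy_dict()     # "continuous_action" / "discrete_action"
run_out_probs = log_probs.to_numpy_dict()   # "continuous_log_probs" / "discrete_log_probs"

# Rebuild during an update from a mini-batch that stored those keys:
actions = AgentAction.extract(mini_batch)
old_log_probs = ActionLogProbs.extract(mini_batch)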