
Commit message: bc works
Branch: /develop/actionmodel-csharp
Author: Andrew Cohen, 4 years ago
Commit: 1d234d1d
9 files changed, 70 insertions(+), 39 deletions(-)
  1. ml-agents/mlagents/trainers/policy/policy.py (1 change)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (4 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (2 changes)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)
  5. ml-agents/mlagents/trainers/tests/simple_test_envs.py (19 changes)
  6. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (23 changes)
  7. ml-agents/mlagents/trainers/torch/components/bc/module.py (34 changes)
  8. ml-agents/mlagents/trainers/torch/networks.py (8 changes)
  9. ml-agents/mlagents/trainers/torch/utils.py (16 changes)

1. ml-agents/mlagents/trainers/policy/policy.py (1 change)

     ) -> None:
         if memory_matrix is None:
             return
         for index, agent_id in enumerate(agent_ids):
             self.memory_dict[agent_id] = memory_matrix[index, :]
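
The policy keeps one memory row per agent id. A standalone toy version of that bookkeeping (names here are made up, not the ML-Agents API):

    import numpy as np

    memory_dict = {}

    def save_memories(agent_ids, memory_matrix):
        # memory_matrix has shape [num_agents, memory_size]; store one row per agent.
        if memory_matrix is None:
            return
        for index, agent_id in enumerate(agent_ids):
            memory_dict[agent_id] = memory_matrix[index, :]

    save_memories(["agent-0", "agent-1"], np.zeros((2, 16), dtype=np.float32))
    print(sorted(memory_dict))  # ['agent-0', 'agent-1']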

2. ml-agents/mlagents/trainers/policy/torch_policy.py (4 changes)

     ) -> Tuple[SplitObservations, np.ndarray]:
         vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
         mask = None
-        if not self.use_continuous_act:
+        if self.action_spec.discrete_size > 0:
             mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
             if decision_requests.action_mask is not None:
                 mask = torch.as_tensor(

         run_out["action"] = action.to_numpy_dict()
         run_out["pre_action"] = (
             action.to_numpy_dict()["continuous_action"]
-            if self.use_continuous_act
+            if self.action_spec.continuous_size > 0
             else None
         )  # Todo - make pre_action difference
         run_out["log_probs"] = log_probs.to_numpy_dict()

3. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (2 changes)

         )
         stats_needed.update(reward_signal.stats_name_to_update_name)
+        for tens, d in feed_dict.items():
+            print(tens, d)
         update_vals = self._execute_model(feed_dict, self.update_dict)
         for stat_name, update_name in stats_needed.items():
             update_stats[stat_name] = update_vals[update_name]

4. ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)

             behavior_spec,
             self.trainer_settings,
             condition_sigma_on_obs=False,  # Faster training for PPO
-            separate_critic=behavior_spec.action_spec.is_continuous(),
+            separate_critic=behavior_spec.action_spec.continuous_size > 0,
         )
         return policy
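
The separate_critic flag now keys off continuous_size > 0 rather than is_continuous(), which matters once an action spec can be hybrid. A toy spec (not the real ActionSpec class) shows the difference:

    from dataclasses import dataclass
    from typing import Tuple

    @dataclass
    class ToyActionSpec:
        continuous_size: int
        discrete_branches: Tuple[int, ...]

        def is_continuous(self) -> bool:
            # Continuous-only, the pre-hybrid meaning of "continuous".
            return self.continuous_size > 0 and len(self.discrete_branches) == 0

    hybrid = ToyActionSpec(continuous_size=2, discrete_branches=(3,))
    print(hybrid.is_continuous())      # False: the old check would disable the separate critic
    print(hybrid.continuous_size > 0)  # True: the new check keeps it for hybrid specs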

5. ml-agents/mlagents/trainers/tests/simple_test_envs.py (19 changes)

         num_vector=1,
         vis_obs_size=VIS_OBS_SIZE,
         vec_obs_size=OBS_SIZE,
-        continuous_action_size=1,
-        discrete_action_size=1,
+        continuous_action_size=0,
+        discrete_action_size=0,
     ):
         super().__init__()
         self.num_visual = num_visual

 class MemoryEnvironment(SimpleEnvironment):
-    def __init__(self, brain_names, use_discrete, step_size=0.2):
-        super().__init__(brain_names, use_discrete, step_size=step_size)
+    def __init__(
+        self,
+        brain_names,
+        continuous_action_size=1,
+        discrete_action_size=1,
+        step_size=0.2,
+    ):
+        super().__init__(
+            brain_names,
+            continuous_action_size=continuous_action_size,
+            discrete_action_size=discrete_action_size,
+            step_size=step_size,
+        )
         # Number of steps to reveal the goal for. Lower is harder. Should be
         # less than 1/step_size to force agent to use memory
         self.num_show_steps = 2
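
With the signatures visible in this hunk and in test_hybrid.py below, the test environments are now parameterized by both action sizes. A usage sketch (the brain name is arbitrary; running it requires the ml-agents test package from this branch):

    from mlagents.trainers.tests.simple_test_envs import (
        MemoryEnvironment,
        SimpleEnvironment,
    )

    # Hybrid action space: one continuous action and one discrete branch.
    env = SimpleEnvironment(
        ["test_brain"], continuous_action_size=1, discrete_action_size=1, step_size=0.8
    )
    # The memory variant forwards the same action-size kwargs to SimpleEnvironment.
    memory_env = MemoryEnvironment(
        ["test_brain"], continuous_action_size=1, discrete_action_size=1
    )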

6. ml-agents/mlagents/trainers/tests/torch/test_hybrid.py (23 changes)

 SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
+
+def test_recurrent_ppo():
+    env = MemoryEnvironment(
+        [BRAIN_NAME], continuous_action_size=1, discrete_action_size=1
+    )
+    new_network_settings = attr.evolve(
+        PPO_TORCH_CONFIG.network_settings,
+        memory=NetworkSettings.MemorySettings(memory_size=16),
+    )
+    new_hyperparams = attr.evolve(
+        PPO_TORCH_CONFIG.hyperparameters,
+        learning_rate=1.0e-3,
+        batch_size=64,
+        buffer_size=128,
+    )
+    config = attr.evolve(
+        PPO_TORCH_CONFIG,
+        hyperparameters=new_hyperparams,
+        network_settings=new_network_settings,
+        max_steps=5000,
+    )
+    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
+
 def test_hybrid_ppo():
     env = SimpleEnvironment(
         [BRAIN_NAME], continuous_action_size=1, discrete_action_size=1, step_size=0.8
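
The new test builds its configuration by copying frozen attrs settings objects with attr.evolve. A self-contained sketch of that pattern (the settings class here is a stand-in, not the real PPO settings):

    import attr

    @attr.s(auto_attribs=True, frozen=True)
    class ToyHyperparameters:
        learning_rate: float = 3.0e-4
        batch_size: int = 1024
        buffer_size: int = 10240

    base = ToyHyperparameters()
    # evolve() returns a copy with only the named fields replaced.
    tuned = attr.evolve(base, learning_rate=1.0e-3, batch_size=64, buffer_size=128)
    print(tuned)
    # ToyHyperparameters(learning_rate=0.001, batch_size=64, buffer_size=128)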

7. ml-agents/mlagents/trainers/torch/components/bc/module.py (34 changes)

         log_probs: ActionLogProbs,
         expert_actions: torch.Tensor,
     ) -> torch.Tensor:
-        if self.policy.use_continuous_act:
-            bc_loss = torch.nn.functional.mse_loss(
-                selected_actions.continuous_tensor, expert_actions
+        bc_loss = 0
+        if self.policy.action_spec.continuous_size > 0:
+            bc_loss += torch.nn.functional.mse_loss(
+                selected_actions.continuous_tensor, expert_actions.continuous_tensor
             )
+        if self.policy.action_spec.discrete_size > 0:
+            one_hot_expert_actions = ModelUtils.actions_to_onehot(
+                expert_actions.discrete_tensor,
+                self.policy.action_spec.discrete_branches,
-        else:
-            bc_loss = torch.mean(
+            bc_loss += torch.mean(
                 torch.stack(
                     [
                         torch.sum(

                         )
                         for log_prob_branch, expert_actions_branch in zip(
-                            log_prob_branches, expert_actions
+                            log_prob_branches, one_hot_expert_actions
+        print(bc_loss)
         return bc_loss

     def _update_batch(

         """
         vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
         act_masks = None
-        if self.policy.use_continuous_act:
-            expert_actions = ModelUtils.list_to_tensor(
-                mini_batch_demo["continuous_action"]
-            )
-        else:
-            raw_expert_actions = ModelUtils.list_to_tensor(
-                mini_batch_demo["discrete_action"], dtype=torch.long
-            )
-            expert_actions = ModelUtils.actions_to_onehot(
-                raw_expert_actions, self.policy.act_size
-            )
+        expert_actions = AgentAction.from_dict(mini_batch_demo)
+        if self.policy.action_spec.discrete_size > 0:
             act_masks = ModelUtils.list_to_tensor(
                 np.ones(
                     (

         else:
             vis_obs = []
-        selected_actions, log_probs, _, _ = self.policy.sample_actions(
+        selected_actions, log_probs, _, _, _ = self.policy.sample_actions(
             vec_obs,
             vis_obs,
             masks=act_masks,
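
The new loss combines an MSE term over the continuous actions with a log-likelihood term over one-hot expert actions for each discrete branch. A self-contained sketch of that combination (toy shapes; the ActionLogProbs and AgentAction wrappers are replaced by plain tensors):

    import torch
    import torch.nn.functional as F

    def hybrid_bc_loss(
        policy_continuous: torch.Tensor,   # [batch, continuous_size]
        expert_continuous: torch.Tensor,   # [batch, continuous_size]
        branch_log_probs: list,            # per branch: [batch, branch_size] log-probs
        expert_discrete: torch.Tensor,     # [batch, num_branches] expert action indices
    ) -> torch.Tensor:
        # Continuous part: regress the policy's actions toward the expert's.
        bc_loss = F.mse_loss(policy_continuous, expert_continuous)
        # Discrete part: negative log-likelihood of the expert action per branch.
        branch_losses = []
        for i, log_probs in enumerate(branch_log_probs):
            one_hot = F.one_hot(expert_discrete[:, i], num_classes=log_probs.shape[1]).float()
            branch_losses.append(torch.sum(-log_probs * one_hot, dim=1))
        return bc_loss + torch.mean(torch.stack(branch_losses))

    # Toy example: batch of 4, 2 continuous actions, one discrete branch of size 3.
    cont = torch.zeros(4, 2)
    log_p = [torch.log_softmax(torch.randn(4, 3), dim=1)]
    expert_idx = torch.randint(0, 3, (4, 1))
    print(hybrid_bc_loss(cont, cont, log_p, expert_idx))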

8. ml-agents/mlagents/trainers/torch/networks.py (8 changes)

         else:
             critic_mem = None
             actor_mem = None
-        encoding, memories = self.network_body(
-            vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
+        encoding, actor_mem_outs = self.network_body(
+            vec_inputs, vis_inputs, memories=actor_mem, sequence_length=sequence_length
         )
         log_probs, entropies = self.action_model.evaluate(encoding, masks, actions)
         value_outputs, critic_mem_outs = self.critic(

         else:
             critic_mem = None
             actor_mem = None
-        encoding, memories = self.network_body(
-            vec_inputs, vis_inputs, memories=memories, sequence_length=sequence_length
+        encoding, actor_mem_outs = self.network_body(
+            vec_inputs, vis_inputs, memories=actor_mem, sequence_length=sequence_length
         )
         action, log_probs, entropies = self.action_model(encoding, masks)
         value_outputs, critic_mem_outs = self.critic(
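
The actor and critic now carry separate memories (actor_mem and critic_mem) instead of one shared memories tensor. One plausible way to derive them, assuming a single tensor is split in half along the feature dimension (an assumption, not shown in this hunk):

    import torch

    batch, seq, memory_size = 2, 1, 32
    memories = torch.zeros(batch, seq, memory_size)

    if memories is not None:
        # Assumed convention: first half feeds the actor body, second half the critic.
        actor_mem, critic_mem = torch.split(memories, memory_size // 2, dim=-1)
    else:
        actor_mem = critic_mem = None

    # Outputs from the two bodies would be re-joined the same way.
    new_memories = torch.cat([actor_mem, critic_mem], dim=-1)
    print(new_memories.shape)  # torch.Size([2, 1, 32])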

9. ml-agents/mlagents/trainers/torch/utils.py (16 changes)

     @property
     def flattened_size(self) -> int:
-        if self._specs.is_continuous():
-            return self._specs.continuous_size
-        else:
-            return sum(self._specs.discrete_branches)
+        return self._specs.continuous_size + sum(self._specs.discrete_branches)

-        if self._specs.is_continuous():
-            return action.continuous_tensor
-        else:
-            return torch.cat(
+        action_list: List[torch.Tensor] = []
+        if self._specs.continuous_size > 0:
+            action_list.append(action.continuous_tensor)
+        if self._specs.discrete_size > 0:
+            flat_discrete = torch.cat(
                 ModelUtils.actions_to_onehot(
                     torch.as_tensor(action.discrete_tensor, dtype=torch.long),
                     self._specs.discrete_branches,

+            action_list.append(flat_discrete)
+        return torch.cat(action_list, dim=1)

     @staticmethod
     def update_learning_rate(optim: torch.optim.Optimizer, lr: float) -> None:
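
flattened_size is now continuous_size + sum(discrete_branches), and the flattener concatenates the continuous tensor with one-hot encodings of each discrete branch. A toy illustration with plain tensors (not the AgentAction class):

    import torch
    import torch.nn.functional as F

    continuous = torch.tensor([[0.5, -0.2]])   # continuous_size = 2
    discrete = torch.tensor([[1, 0]])          # chosen index per branch
    discrete_branches = (3, 2)                 # branch sizes

    action_list = [continuous]
    one_hots = [
        F.one_hot(discrete[:, i], num_classes=size).float()
        for i, size in enumerate(discrete_branches)
    ]
    action_list.append(torch.cat(one_hots, dim=1))
    flat = torch.cat(action_list, dim=1)
    print(flat.shape[1])  # 7 == 2 + sum((3, 2))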
