
trains successfully

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit: c494bfcc
4 files changed, 35 insertions(+), 33 deletions(-)
  1. ml-agents/mlagents/trainers/policy/torch_policy.py (1 change)
  2. ml-agents/mlagents/trainers/tests/simple_test_envs.py (29 changes)
  3. ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (15 changes)
  4. ml-agents/mlagents/trainers/torch/action_models.py (23 changes)

ml-agents/mlagents/trainers/policy/torch_policy.py (1 change)


    ) -> Tuple[SplitObservations, np.ndarray]:
        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
        # mask = None
        print(self.discrete_act_size)
        mask = torch.ones([len(decision_requests), np.sum(self.discrete_act_size)])
        if decision_requests.action_mask is not None:
            mask = torch.as_tensor(
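
For context (not part of the diff): the hunk above appears to build a default all-ones mask spanning the flattened discrete branches, so every discrete action stays available unless decision_requests.action_mask overrides it. A minimal sketch, assuming discrete_act_size is a list of branch sizes (all names below are illustrative, not the file's API):

    import numpy as np
    import torch

    def default_action_mask(num_agents: int, discrete_act_size):
        # One column per discrete action across every branch; ones mean
        # "not masked", matching the fallback built in the hunk above.
        return torch.ones([num_agents, int(np.sum(discrete_act_size))])

    mask = default_action_mask(4, [3, 2])
    print(mask.shape)  # torch.Size([4, 5])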

ml-agents/mlagents/trainers/tests/simple_test_envs.py (29 changes)


        num_vector=1,
        vis_obs_size=VIS_OBS_SIZE,
        vec_obs_size=OBS_SIZE,
        action_size=1,
        continuous_action_size=1,
        discrete_action_size=1,
    ):
        super().__init__(brain_names, False)
        self.continuous_env = SimpleEnvironment(

            num_vector,
            vis_obs_size,
            vec_obs_size,
            action_size,
            continuous_action_size,
        )
        self.discrete_env = SimpleEnvironment(
            brain_names,

            num_vector,
            vis_obs_size,
            vec_obs_size,
            action_size,
            discrete_action_size,
            False,
            True,  # This is needed for env to generate masks correctly
            action_size=discrete_action_size,  # This is needed for env to generate masks correctly
            self._make_obs_spec(), action_size, tuple(2 for _ in range(action_size))
            self._make_obs_spec(), continuous_action_size, tuple(2 for _ in range(discrete_action_size))
        self.action_size = action_size
        self.continuous_action_size = continuous_action_size
        self.discrete_action_size = discrete_action_size
        self.continuous_action = {}
        self.discrete_action = {}

        for name in self.names:
            cont_done = self.continuous_env._take_action(name)
            cont_reward = self.continuous_env._compute_reward(name, cont_done)
            disc_reward = self.discrete_env._compute_reward(name, disc_done)
            reward = (cont_reward + disc_reward) / 2
            reward = 0
            for _pos in self.continuous_env.positions[name] + self.discrete_env.positions[name]:
                reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
                    self.continuous_env.positions[name] + self.discrete_env.positions[name]
                )
            self.rewards[name] += reward
            self.step_result[name] = self._make_batched_step(
                name, all_done, reward
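
For context (not part of the diff): the replacement reward above no longer averages the two sub-environments' own rewards; it averages SUCCESS_REWARD * position * goal over every position tracked by both wrapped environments. A small worked sketch with illustrative values (SUCCESS_REWARD and the positions are made up, not taken from the file):

    SUCCESS_REWARD = 1.0  # illustrative value only
    goal = 1.0
    positions = [0.5, 1.0, 0.25]  # continuous_env positions + discrete_env positions

    reward = 0.0
    for _pos in positions:
        reward += (SUCCESS_REWARD * _pos * goal) / len(positions)

    print(round(reward, 3))  # (0.5 + 1.0 + 0.25) / 3 = 0.583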

        self.discrete_env.goal = self.goal

    def set_actions(self, behavior_name: BehaviorName, action) -> None:
        continuous_action = action[:, :self.action_size]
        discrete_action = action[:, self.action_size:]
        continuous_action = action[:, :self.continuous_action_size]
        discrete_action = action[:, self.continuous_action_size:]
        self.continuous_env.set_actions(behavior_name, continuous_action)
        self.discrete_env.set_actions(behavior_name, discrete_action)
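
For context (not part of the diff): set_actions now splits one concatenated action array at the continuous width instead of the old action_size, sending the first continuous_action_size columns to the continuous environment and the remaining columns to the discrete one. A standalone sketch of that slicing (shapes and values are illustrative):

    import numpy as np

    continuous_action_size = 1
    discrete_action_size = 2

    # One row per agent: [continuous actions..., discrete branch choices...]
    action = np.array([[0.3, 1, 0],
                       [-0.7, 0, 1]])

    continuous_action = action[:, :continuous_action_size]  # shape (2, 1)
    discrete_action = action[:, continuous_action_size:]    # shape (2, 2)
    print(continuous_action.shape, discrete_action.shape)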

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (15 changes)


# config = attr.evolve(PPO_CONFIG)
# _check_environment_trains(env, {BRAIN_NAME: config})
def test_hybrid_ppo():
    env = HybridEnvironment([BRAIN_NAME], action_size=3)
    config = attr.evolve(PPO_CONFIG)
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=2.0)
# def test_hybrid_ppo():
#     env = HybridEnvironment([BRAIN_NAME], action_size=1, step_size=0.2)
#     config = attr.evolve(PPO_CONFIG, max_steps=10000)
#     _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=5.0)
def test_2dhybrid_ppo():
    env = HybridEnvironment([BRAIN_NAME], continuous_action_size=1, discrete_action_size=2, step_size=0.8)
    new_hyperparams = attr.evolve(
        PPO_CONFIG.hyperparameters, batch_size=128, buffer_size=1280
    )
    config = attr.evolve(PPO_CONFIG, hyperparameters=new_hyperparams, max_steps=100000)
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=5.0)
#
# @pytest.mark.parametrize("use_discrete", [True, False])
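
For context (not part of the diff): both tests derive their trainer settings from PPO_CONFIG with attr.evolve, which copies an attrs instance while overriding chosen fields. A hedged sketch of that override pattern using made-up settings classes (not the real ML-Agents config types):

    import attr

    @attr.s(auto_attribs=True, frozen=True)
    class Hyperparameters:  # hypothetical stand-in, not the ML-Agents class
        batch_size: int = 32
        buffer_size: int = 320

    @attr.s(auto_attribs=True, frozen=True)
    class TrainerConfig:  # hypothetical stand-in for PPO_CONFIG's type
        hyperparameters: Hyperparameters = Hyperparameters()
        max_steps: int = 2000

    base = TrainerConfig()
    new_hyperparams = attr.evolve(base.hyperparameters, batch_size=128, buffer_size=1280)
    config = attr.evolve(base, hyperparameters=new_hyperparams, max_steps=100000)
    print(config.hyperparameters.batch_size, config.max_steps)  # 128 100000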

ml-agents/mlagents/trainers/torch/action_models.py (23 changes)


        self.discrete_distribution = MultiCategoricalDistribution(self.encoding_size, discrete_act_size)
        continuous_actions, discrete_actions = torch.split(actions, self.continuous_act_size, dim=1)
        continuous_actions, discrete_actions = torch.split(actions, [self.continuous_act_size, len(self.discrete_act_size)], dim=1)
        discrete_action_list = [discrete_actions[..., i] for i in range(discrete_actions.shape[-1])]
        discrete_action_list = [discrete_actions[:, i] for i in range(len(self.discrete_act_size))]
        log_probs = torch.add(continuous_log_probs, discrete_log_probs)
        entropies = torch.add(continuous_entropies, discrete_entropies)
        log_probs = torch.cat([continuous_log_probs, discrete_log_probs], dim=1)
        entropies = torch.cat([continuous_entropies, torch.mean(discrete_entropies, dim=0).unsqueeze(0)], dim=1)
        return log_probs, entropies

    def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:

        )
        continuous_actions = torch.stack(continuous_action_list, dim=-1)
        continuous_actions = continuous_actions[:, :, 0]
        discrete_action_list = self._sample_action(discrete_dists)
        discrete_entropies, discrete_log_probs, discrete_all_probs = ModelUtils.get_probs_and_entropy(

        discrete_actions = discrete_actions[:, 0, :]
        action = torch.cat([continuous_actions, discrete_actions.type(torch.float)], axis=1)
        log_probs = torch.add(continuous_log_probs, discrete_log_probs)
        entropies = torch.add(continuous_entropies, discrete_entropies)
        # print("ac", action)
        # print("clp", continuous_log_probs)
        # print("dlp", discrete_log_probs)
        # print("lp", log_probs)
        # print("en", entropies)
        action = torch.cat([continuous_actions, discrete_actions.type(torch.float)], dim=1)
        log_probs = torch.cat([continuous_log_probs, discrete_log_probs], dim=1)
        entropies = torch.cat([continuous_entropies, discrete_entropies], dim=1)
        return (action, log_probs, entropies)
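
For context (not part of the diff): the replacement lines switch the hybrid action model's outputs from element-wise torch.add to concatenation, so the continuous log-probs/entropies and the per-branch discrete ones keep their own columns along dim 1, and the sampled actions are joined the same way. A minimal shape sketch with illustrative tensors (1 continuous action, 2 discrete branches):

    import torch

    batch = 4
    continuous_act_size = 1
    discrete_branches = 2

    continuous_actions = torch.randn(batch, continuous_act_size)
    discrete_actions = torch.randint(0, 2, (batch, discrete_branches))

    continuous_log_probs = torch.randn(batch, continuous_act_size)
    discrete_log_probs = torch.randn(batch, discrete_branches)

    # Concatenate instead of adding, so each action component keeps its own columns.
    action = torch.cat([continuous_actions, discrete_actions.type(torch.float)], dim=1)
    log_probs = torch.cat([continuous_log_probs, discrete_log_probs], dim=1)
    print(action.shape, log_probs.shape)  # torch.Size([4, 3]) torch.Size([4, 3])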