
[bug-fix] Change Simple1DEnvironment to spawn new agent IDs on reset (#3558)

/bug-failed-api-check
GitHub · 5 years ago
Current commit: 213d2466
2 files changed, with 103 insertions and 15 deletions
  1. ml-agents/mlagents/trainers/tests/simple_test_envs.py (98 lines changed)
  2. ml-agents/mlagents/trainers/tests/test_simple_rl.py (20 lines changed)
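
In short: before this change the test environments always reported agent id 0, so every episode of a given brain looked like a continuation of the same agent. The fix keeps a per-brain counter that _reset_agent bumps at every episode end, and the batched step that reports done=True also carries the first observation of the freshly spawned id. A minimal sketch of that id behaviour (illustration only, not code from the PR):

    import numpy as np

    agent_id = 0                        # per-brain counter, bumped on every reset

    def end_of_episode_ids():
        global agent_id
        old_id = agent_id               # terminal entry keeps the finished id
        agent_id += 1                   # "spawn" a new agent for the next episode
        return np.array([old_id, agent_id], dtype=np.int32)

    end_of_episode_ids()                # -> array([0, 1], dtype=int32)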

ml-agents/mlagents/trainers/tests/simple_test_envs.py (98 lines changed)


  import random
- from typing import Dict, List, Any
+ from typing import Dict, List, Any, Tuple
  import numpy as np
  from mlagents_envs.base_env import (

  self.rewards: Dict[str, float] = {}
  self.final_rewards: Dict[str, List[float]] = {}
  self.step_result: Dict[str, BatchedStepResult] = {}
+ self.agent_id: Dict[str, int] = {}
+ self.agent_id[name] = 0
  self.goal[name] = self.random.choice([-1, 1])
  self.rewards[name] = 0
  self.final_rewards[name] = []

      delta = 1 if act else -1
  else:
      delta = self.action[name][0][0]
  delta = clamp(delta, -self.step_size, self.step_size)
  self.position[name] += delta
  self.position[name] = clamp(self.position[name], -1, 1)
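
For context, the 1-D dynamics these lines rely on, as a small sketch; clamp is assumed to be the usual min/max helper defined elsewhere in simple_test_envs.py:

    def clamp(x, lo, hi):
        # keep x inside [lo, hi]
        return max(lo, min(hi, x))

    # Each step, the action yields a delta (+/-1 for discrete actions, the raw float
    # otherwise); the delta is clamped to +/-step_size and the updated position is
    # clamped to [-1, 1].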

  m_vector_obs = self._make_obs(self.goal[name])
  m_reward = np.array([reward], dtype=np.float32)
  m_done = np.array([done], dtype=np.bool)
- m_agent_id = np.array([0], dtype=np.int32)
+ m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+ if done:
+     self._reset_agent(name)
+     new_vector_obs = self._make_obs(self.goal[name])
+     (
+         m_vector_obs,
+         m_reward,
+         m_done,
+         m_agent_id,
+         action_mask,
+     ) = self._construct_reset_step(
+         m_vector_obs,
+         new_vector_obs,
+         m_reward,
+         m_done,
+         m_agent_id,
+         action_mask,
+         name,
+     )
-     m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
+     m_vector_obs,
+     m_reward,
+     m_done,
+     np.zeros(m_done.shape, dtype=bool),
+     m_agent_id,
+     action_mask,
+ def _construct_reset_step(
+     self,
+     vector_obs: List[np.ndarray],
+     new_vector_obs: List[np.ndarray],
+     reward: np.ndarray,
+     done: np.ndarray,
+     agent_id: np.ndarray,
+     action_mask: List[np.ndarray],
+     name: str,
+ ) -> Tuple[List[np.ndarray], np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+     new_reward = np.array([0.0], dtype=np.float32)
+     new_done = np.array([False], dtype=np.bool)
+     new_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+     new_action_mask = self._generate_mask()
+     m_vector_obs = [
+         np.concatenate((old, new), axis=0)
+         for old, new in zip(vector_obs, new_vector_obs)
+     ]
+     m_reward = np.concatenate((reward, new_reward), axis=0)
+     m_done = np.concatenate((done, new_done), axis=0)
+     m_agent_id = np.concatenate((agent_id, new_agent_id), axis=0)
+     if action_mask is not None:
+         action_mask = [
+             np.concatenate((old, new), axis=0)
+             for old, new in zip(action_mask, new_action_mask)
+         ]
+     return m_vector_obs, m_reward, m_done, m_agent_id, action_mask
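
A hedged illustration (not part of the diff) of what _construct_reset_step returns when an agent finishes an episode: index 0 is the terminal entry for the old agent id, index 1 is the first entry for the newly spawned id, with zero reward and done=False:

    import numpy as np

    old_id, new_id = 3, 4                                              # ids before/after _reset_agent
    m_reward = np.concatenate((np.array([1.0], dtype=np.float32),
                               np.array([0.0], dtype=np.float32)))     # [1.0, 0.0]
    m_done = np.concatenate((np.array([True]), np.array([False])))     # [True, False]
    m_agent_id = np.concatenate((np.array([old_id], dtype=np.int32),
                                 np.array([new_id], dtype=np.int32)))  # [3, 4]
    # The fourth positional argument of BatchedStepResult (assumed here to be the
    # max_step flag) is now all-False instead of reusing m_done:
    m_max_step = np.zeros(m_done.shape, dtype=bool)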
- if done:
-     self._reset_agent(name)

  def _generate_mask(self):
      if self.discrete:

      self.step_count[name] = 0
      self.final_rewards[name].append(self.rewards[name])
      self.rewards[name] = 0
+     self.agent_id[name] = self.agent_id[name] + 1

  def reset(self) -> None:  # type: ignore
      for name in self.names:

  m_vector_obs = self._make_obs(recurrent_obs_val)
  m_reward = np.array([reward], dtype=np.float32)
  m_done = np.array([done], dtype=np.bool)
- m_agent_id = np.array([0], dtype=np.int32)
+ m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+ if done:
+     self._reset_agent(name)
+     recurrent_obs_val = (
+         self.goal[name] if self.step_count[name] <= self.num_show_steps else 0
+     )
+     new_vector_obs = self._make_obs(recurrent_obs_val)
+     (
+         m_vector_obs,
+         m_reward,
+         m_done,
+         m_agent_id,
+         action_mask,
+     ) = self._construct_reset_step(
+         m_vector_obs,
+         new_vector_obs,
+         m_reward,
+         m_done,
+         m_agent_id,
+         action_mask,
+         name,
+     )
-     m_vector_obs, m_reward, m_done, m_done, m_agent_id, action_mask
+     m_vector_obs,
+     m_reward,
+     m_done,
+     np.zeros(m_done.shape, dtype=bool),
+     m_agent_id,
+     action_mask,
      )
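
A small sketch of the observation rule the memory-environment hunk encodes: the goal is only visible for the first num_show_steps steps of an episode, after which the observation is zeroed so a recurrent policy has to remember it:

    def recurrent_obs_val(goal: float, step_count: int, num_show_steps: int) -> float:
        # show the goal early in the episode, blank it out afterwards
        return goal if step_count <= num_show_steps else 0.0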

ml-agents/mlagents/trainers/tests/test_simple_rl.py (20 lines changed)


      lambd: 0.95
      learning_rate: 5.0e-3
      learning_rate_schedule: constant
-     max_steps: 1500
+     max_steps: 2000
      memory_size: 16
      normalize: false
      num_epoch: 3
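
The tests below build trainer configs by applying a dict of overrides on top of YAML blocks like the one above. A hedged sketch of that pattern, assuming generate_config simply merges the overrides into the parsed YAML (the real helper in test_simple_rl.py may also key the result by brain name):

    import yaml

    def generate_config(config_yaml: str, override_vals: dict) -> dict:
        config = yaml.safe_load(config_yaml)
        config.update(override_vals)        # e.g. {"max_steps": 2000}
        return config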

  @pytest.mark.parametrize("num_visual", [1, 2])
  def test_visual_ppo(num_visual, use_discrete):
      env = Simple1DEnvironment(
-         [BRAIN_NAME], use_discrete=use_discrete, num_visual=num_visual, num_vector=0
+         [BRAIN_NAME],
+         use_discrete=use_discrete,
+         num_visual=num_visual,
+         num_vector=0,
+         step_size=0.2,
      )
      override_vals = {"learning_rate": 3.0e-4}
      config = generate_config(PPO_CONFIG, override_vals)

      }
      config = generate_config(PPO_CONFIG, override_vals)
      # The number of steps is pretty small for these encoders
-     _check_environment_trains(env, config, success_threshold=0.9)
+     _check_environment_trains(env, config, success_threshold=0.5)
  @pytest.mark.parametrize("use_discrete", [True, False])

  @pytest.mark.parametrize("num_visual", [1, 2])
  def test_visual_sac(num_visual, use_discrete):
      env = Simple1DEnvironment(
-         [BRAIN_NAME], use_discrete=use_discrete, num_visual=num_visual, num_vector=0
+         [BRAIN_NAME],
+         use_discrete=use_discrete,
+         num_visual=num_visual,
+         num_vector=0,
+         step_size=0.2,
      )
      override_vals = {"batch_size": 16, "learning_rate": 3e-4}
      config = generate_config(SAC_CONFIG, override_vals)

      }
      config = generate_config(SAC_CONFIG, override_vals)
      # The number of steps is pretty small for these encoders
-     _check_environment_trains(env, config, success_threshold=0.9)
+     _check_environment_trains(env, config, success_threshold=0.5)

-     override_vals = {"batch_size": 32, "use_recurrent": True}
+     override_vals = {"batch_size": 32, "use_recurrent": True, "max_steps": 2000}
      config = generate_config(SAC_CONFIG, override_vals)
      _check_environment_trains(env, config)
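
For reference, a simplified, hypothetical version of the assertion behind _check_environment_trains (the real helper also builds the trainers and runs the training loop); it shows why dropping success_threshold from 0.9 to 0.5 loosens the visual-encoder tests:

    def check_trained(final_rewards, success_threshold=0.9):
        # final_rewards holds the cumulative reward of each finished episode
        mean_reward = sum(final_rewards) / len(final_rewards)
        assert mean_reward > success_threshold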
