
fix tests

/MLA-1734-demo-provider
Ruo-Ping Dong, 4 years ago
Current commit
471a2e82
6 files changed, 80 insertions(+), 26 deletions(-)
  1. gym-unity/gym_unity/tests/test_gym.py (7 changes)
  2. ml-agents-envs/mlagents_envs/base_env.py (20 changes)
  3. ml-agents-envs/mlagents_envs/rpc_utils.py (4 changes)
  4. ml-agents-envs/mlagents_envs/tests/test_steps.py (4 changes)
  5. ml-agents/mlagents/trainers/tests/mock_brain.py (10 changes)
  6. ml-agents/mlagents/trainers/tests/simple_test_envs.py (61 changes)

gym-unity/gym_unity/tests/test_gym.py (7 changes)


     ] * number_visual_observations
     rewards = np.array(num_agents * [1.0])
     agents = np.array(range(0, num_agents))
-    return DecisionSteps(obs, rewards, agents, None), TerminalSteps.empty(specs)
+    group_id = np.array(num_agents * [0])
+    group_rewards = np.array(num_agents * [0.0])
+    return (
+        DecisionSteps(obs, rewards, agents, None, group_id, group_rewards),
+        TerminalSteps.empty(specs),
+    )

 def setup_mock_unityenvironment(mock_env, mock_spec, mock_decision, mock_termination):
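Since the mocked DecisionSteps now carries the two extra group arrays, it may help to see how a consumer reads it. A minimal sketch, assuming the post-change constructor order used above (obs, rewards, agent_id, action_mask, group_id, group_reward); this is illustrative, not the gym wrapper's actual code:

    import numpy as np
    from mlagents_envs.base_env import DecisionSteps, TerminalSteps

    def to_gym_transition(decision: DecisionSteps, terminal: TerminalSteps):
        # Conceptual single-agent translation: the gym API only needs
        # obs/reward/done, so the new group_id/group_reward fields are simply
        # carried along inside the steps and ignored here.
        if len(terminal) > 0:
            step = terminal[terminal.agent_id[0]]
            return step.obs, step.reward, True, {}
        step = decision[decision.agent_id[0]]
        return step.obs, step.reward, False, {}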

ml-agents-envs/mlagents_envs/base_env.py (20 changes)


     obs: List[np.ndarray]
     reward: float
-    group_reward: float
@@ ... @@
+    group_reward: float

 class DecisionSteps(Mapping):
@@ ... @@
     this simulation step.
     """

-    def __init__(self, obs, reward, group_reward, agent_id, action_mask, group_id):
+    def __init__(self, obs, reward, agent_id, action_mask, group_id, group_reward):
@@ ... @@
-        self.group_reward: np.ndarray = group_reward
@@ ... @@
+        self.group_reward: np.ndarray = group_reward
         self._agent_id_to_index: Optional[Dict[AgentId, int]] = None

     @property
@@ ... @@
         return DecisionStep(
             obs=agent_obs,
             reward=self.reward[agent_index],
-            group_reward=self.group_reward[agent_index],
@@ ... @@
+            group_reward=self.group_reward[agent_index],
         )

     def __iter__(self) -> Iterator[Any]:
@@ ... @@
         return DecisionSteps(
             obs=obs,
             reward=np.zeros(0, dtype=np.float32),
-            group_reward=np.zeros(0, dtype=np.float32),
@@ ... @@
+            group_reward=np.zeros(0, dtype=np.float32),
         )
@@ ... @@
     obs: List[np.ndarray]
     reward: float
-    group_reward: float
@@ ... @@
+    group_reward: float

 class TerminalSteps(Mapping):
@@ ... @@
     across simulation steps.
     """

-    def __init__(self, obs, reward, group_reward, interrupted, agent_id, group_id):
+    def __init__(self, obs, reward, interrupted, agent_id, group_id, group_reward):
@@ ... @@
-        self.group_reward: np.ndarray = group_reward
@@ ... @@
+        self.group_reward: np.ndarray = group_reward
         self._agent_id_to_index: Optional[Dict[AgentId, int]] = None

     @property
@@ ... @@
         return TerminalStep(
             obs=agent_obs,
             reward=self.reward[agent_index],
-            group_reward=self.group_reward[agent_index],
@@ ... @@
+            group_reward=self.group_reward[agent_index],
         )

     def __iter__(self) -> Iterator[Any]:
@@ ... @@
         return TerminalSteps(
             obs=obs,
             reward=np.zeros(0, dtype=np.float32),
-            group_reward=np.zeros(0, dtype=np.float32),
@@ ... @@
+            group_reward=np.zeros(0, dtype=np.float32),
         )
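The functional surface here is unchanged; the commit only moves group_reward from the third constructor argument to the last, so positional call sites (the other files in this commit) must be updated, while keyword call sites keep working. A small sketch assuming the post-change order shown above:

    import numpy as np
    from mlagents_envs.base_env import DecisionSteps

    # Post-change positional order: obs, reward, agent_id, action_mask, group_id, group_reward.
    steps = DecisionSteps(
        [np.ones((1, 4), dtype=np.float32)],   # one batched observation array
        np.array([2.0], dtype=np.float32),     # reward
        np.array([7], dtype=np.int32),         # agent_id
        None,                                  # action_mask
        np.array([0], dtype=np.int32),         # group_id
        np.array([0.5], dtype=np.float32),     # group_reward
    )

    # Indexing by agent id yields a DecisionStep that carries the new field too.
    assert steps[7].reward == 2.0
    assert steps[7].group_reward == 0.5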

ml-agents-envs/mlagents_envs/rpc_utils.py (4 changes)


         DecisionSteps(
             decision_obs_list,
             decision_rewards,
-            decision_group_rewards,
@@ ... @@
+            decision_group_rewards,
@@ ... @@
-            terminal_group_rewards,
@@ ... @@
+            terminal_group_rewards,
         ),
     )
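For orientation: the surrounding function (collapsed above) splits one batch of agents into a "still deciding" half and a "terminated" half and feeds each half to the matching constructor; this hunk only reorders the positional arguments to match the new signatures. A rough numpy sketch of that split, using hypothetical inputs rather than the real proto-parsing code:

    import numpy as np

    # Hypothetical per-agent batch; the real function derives these from AgentInfo protos.
    rewards = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    group_rewards = np.array([0.0, 0.5, 0.0], dtype=np.float32)
    done = np.array([False, True, False])

    decision_rewards = rewards[~done]               # agents that still need an action
    decision_group_rewards = group_rewards[~done]
    terminal_rewards = rewards[done]                # agents whose episode just ended
    terminal_group_rewards = group_rewards[done]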

ml-agents-envs/mlagents_envs/tests/test_steps.py (4 changes)


         reward=np.array(range(3), dtype=np.float32),
         agent_id=np.array(range(10, 13), dtype=np.int32),
         action_mask=[np.zeros((3, 4), dtype=np.bool)],
+        group_id=np.array(range(3), dtype=np.int32),
+        group_reward=np.array(range(3), dtype=np.float32),
     )
     assert ds.agent_id_to_index[10] == 0
@@ ... @@
         reward=np.array(range(3), dtype=np.float32),
         agent_id=np.array(range(10, 13), dtype=np.int32),
         interrupted=np.array([1, 0, 1], dtype=np.bool),
+        group_id=np.array(range(3), dtype=np.int32),
+        group_reward=np.array(range(3), dtype=np.float32),
     )
     assert ts.agent_id_to_index[10] == 0
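The existing assertions only exercise the id-to-index mapping; the new arrays just have to be accepted by the constructors. A short sketch of what such a test could additionally check, assuming the Mapping behaviour of DecisionSteps from base_env.py above:

    import numpy as np
    from mlagents_envs.base_env import DecisionSteps

    ds = DecisionSteps(
        obs=[np.array(range(12), dtype=np.float32).reshape(3, 4)],
        reward=np.array(range(3), dtype=np.float32),
        agent_id=np.array(range(10, 13), dtype=np.int32),
        action_mask=[np.zeros((3, 4), dtype=bool)],
        group_id=np.array(range(3), dtype=np.int32),
        group_reward=np.array(range(3), dtype=np.float32),
    )

    # Agent 12 sits at batch index 2, so all its per-agent values come from row 2.
    assert ds.agent_id_to_index[12] == 2
    assert ds[12].reward == 2.0
    assert ds[12].group_id == 2
    assert ds[12].group_reward == 2.0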

ml-agents/mlagents/trainers/tests/mock_brain.py (10 changes)


     reward = np.array(num_agents * [1.0], dtype=np.float32)
     interrupted = np.array(num_agents * [False], dtype=np.bool)
     agent_id = np.arange(num_agents, dtype=np.int32)
+    group_id = np.array(num_agents * [0], dtype=np.int32)
+    group_reward = np.array(num_agents * [0.0], dtype=np.float32)
@@ ... @@
-            TerminalSteps(obs_list, reward, interrupted, agent_id),
+            TerminalSteps(
+                obs_list, reward, interrupted, agent_id, group_id, group_reward
+            ),
@@ ... @@
-            DecisionSteps(obs_list, reward, agent_id, action_mask),
+            DecisionSteps(
+                obs_list, reward, agent_id, action_mask, group_id, group_reward
+            ),
             TerminalSteps.empty(behavior_spec),
         )
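These mocks give every agent group id 0 and a zero group reward, which keeps the old semantics of the trainer tests. A sketch of the terminal half with the post-change positional order (obs, reward, interrupted, agent_id, group_id, group_reward), illustrative only:

    import numpy as np
    from mlagents_envs.base_env import TerminalSteps

    num_agents = 2
    obs_list = [np.zeros((num_agents, 8), dtype=np.float32)]
    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False])
    agent_id = np.arange(num_agents, dtype=np.int32)
    group_id = np.array(num_agents * [0], dtype=np.int32)        # 0 == not grouped, as in the mocks above
    group_reward = np.array(num_agents * [0.0], dtype=np.float32)

    terminal = TerminalSteps(obs_list, reward, interrupted, agent_id, group_id, group_reward)

    assert len(terminal) == num_agents
    assert (terminal.group_reward == 0.0).all()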

ml-agents/mlagents/trainers/tests/simple_test_envs.py (61 changes)


         self.agent_id[name] = self.agent_id[name] + 1

     def _make_batched_step(
-        self, name: str, done: bool, reward: float
+        self, name: str, done: bool, reward: float, group_reward: float
@@ ... @@
+        m_group_id = np.array([0], dtype=np.int32)
+        m_group_reward = np.array([group_reward], dtype=np.float32)
@@ ... @@
-        decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
+        decision_step = DecisionSteps(
+            m_vector_obs, m_reward, m_agent_id, action_mask, m_group_id, m_group_reward
+        )
         terminal_step = TerminalSteps.empty(self.behavior_spec)
         if done:
             self.final_rewards[name].append(self.rewards[name])
@@ ... @@
                 new_done,
                 new_agent_id,
                 new_action_mask,
+                new_group_id,
+                new_group_reward,
@@ ... @@
-                new_vector_obs, new_reward, new_agent_id, new_action_mask
+                new_vector_obs,
+                new_reward,
+                new_agent_id,
+                new_action_mask,
+                new_group_id,
+                new_group_reward,
@@ ... @@
-                m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
+                m_vector_obs,
+                m_reward,
+                np.array([False], dtype=np.bool),
+                m_agent_id,
+                m_group_id,
+                m_group_reward,
@@ ... @@
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
@@ ... @@
-        return new_reward, new_done, new_agent_id, new_action_mask
+        new_group_id = np.array([0], dtype=np.int32)
+        new_group_reward = np.array([0.0], dtype=np.float32)
+        return (
+            new_reward,
+            new_done,
+            new_agent_id,
+            new_action_mask,
+            new_group_id,
+            new_group_reward,
+        )

     def step(self) -> None:
         assert all(action is not None for action in self.action.values())
@@ ... @@
             reward = self._compute_reward(name, done)
             self.rewards[name] += reward
-            self.step_result[name] = self._make_batched_step(name, done, reward)
+            self.step_result[name] = self._make_batched_step(name, done, reward, 0.0)
@@ ... @@
-            self.step_result[name] = self._make_batched_step(name, False, 0.0)
+            self.step_result[name] = self._make_batched_step(name, False, 0.0, 0.0)

     @property
     def reset_parameters(self) -> Dict[str, str]:

         self.num_show_steps = 2

     def _make_batched_step(
-        self, name: str, done: bool, reward: float
+        self, name: str, done: bool, reward: float, group_reward: float
     ) -> Tuple[DecisionSteps, TerminalSteps]:
         recurrent_obs_val = (
             self.goal[name] if self.step_count[name] <= self.num_show_steps else 0
@@ ... @@
         m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+        m_group_id = np.array([0], dtype=np.int32)
+        m_group_reward = np.array([group_reward], dtype=np.float32)
         action_mask = self._generate_mask()
-        decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
+        decision_step = DecisionSteps(
+            m_vector_obs, m_reward, m_agent_id, action_mask, m_group_id, m_group_reward
+        )
         terminal_step = TerminalSteps.empty(self.behavior_spec)
@@ ... @@
                 new_done,
                 new_agent_id,
                 new_action_mask,
+                new_group_id,
+                new_group_reward,
@@ ... @@
-                new_vector_obs, new_reward, new_agent_id, new_action_mask
+                new_vector_obs,
+                new_reward,
+                new_agent_id,
+                new_action_mask,
+                new_group_id,
+                new_group_reward,
@@ ... @@
-                m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
+                m_vector_obs,
+                m_reward,
+                np.array([False], dtype=np.bool),
+                m_agent_id,
+                m_group_id,
+                m_group_reward,
             )
         return (decision_step, terminal_step)
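Both test environments now thread a group_reward argument through _make_batched_step and forward it to the step constructors, with the existing callers passing 0.0 so single-agent behaviour is unchanged. A condensed sketch of that pattern (not the file's exact logic; the helper name and spec argument are illustrative):

    from typing import Tuple

    import numpy as np

    from mlagents_envs.base_env import BehaviorSpec, DecisionSteps, TerminalSteps

    def make_batched_step(
        obs: np.ndarray, reward: float, done: bool, group_reward: float,
        agent_id: int, spec: BehaviorSpec,
    ) -> Tuple[DecisionSteps, TerminalSteps]:
        # Every per-agent field becomes a length-1 array; group_id defaults to 0.
        m_obs = [obs.reshape(1, -1).astype(np.float32)]
        m_reward = np.array([reward], dtype=np.float32)
        m_agent_id = np.array([agent_id], dtype=np.int32)
        m_group_id = np.array([0], dtype=np.int32)
        m_group_reward = np.array([group_reward], dtype=np.float32)
        if done:
            terminal = TerminalSteps(
                m_obs, m_reward, np.array([False]), m_agent_id, m_group_id, m_group_reward
            )
            return DecisionSteps.empty(spec), terminal
        decision = DecisionSteps(
            m_obs, m_reward, m_agent_id, None, m_group_id, m_group_reward
        )
        return decision, TerminalSteps.empty(spec)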
