
Merge branch 'develop-action-buffer' into develop-hybrid-actions-singleton

/develop/actionmodel-csharp
Andrew Cohen, 4 years ago
Current commit
35769b53
20 files changed, with 61 additions and 76 deletions
  1. Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta (2 lines changed)
  2. ml-agents-envs/mlagents_envs/base_env.py (16 lines changed)
  3. ml-agents-envs/mlagents_envs/environment.py (4 lines changed)
  4. ml-agents-envs/mlagents_envs/tests/test_envs.py (9 lines changed)
  5. ml-agents/mlagents/trainers/agent_processor.py (7 lines changed)
  6. ml-agents/mlagents/trainers/env_manager.py (8 lines changed)
  7. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2 lines changed)
  8. ml-agents/mlagents/trainers/policy/policy.py (36 lines changed)
  9. ml-agents/mlagents/trainers/policy/tf_policy.py (2 lines changed)
  10. ml-agents/mlagents/trainers/policy/torch_policy.py (9 lines changed)
  11. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (8 lines changed)
  12. ml-agents/mlagents/trainers/sac/optimizer_tf.py (2 lines changed)
  13. ml-agents/mlagents/trainers/simple_env_manager.py (2 lines changed)
  14. ml-agents/mlagents/trainers/subprocess_env_manager.py (2 lines changed)
  15. ml-agents/mlagents/trainers/tests/mock_brain.py (6 lines changed)
  16. ml-agents/mlagents/trainers/tests/test_agent_processor.py (4 lines changed)
  17. ml-agents/mlagents/trainers/tests/test_trajectory.py (2 lines changed)
  18. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (4 lines changed)
  19. ml-agents/mlagents/trainers/torch/utils.py (4 lines changed)
  20. ml-agents/mlagents/trainers/trajectory.py (8 lines changed)

Project/Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo.meta (2 lines changed)

 guid: 7f11f35191533404c9957443a681aaee
 ScriptedImporter:
   fileIDToRecycleName:
-    11400002: Assets/ML-Agents/Examples/Pushblock/Demos/ExpertPush.demo
+    11400000: Assets/ML-Agents/Examples/PushBlock/Demos/ExpertPush.demo
   externalObjects: {}
   userData: ' (Unity.MLAgents.Demonstrations.DemonstrationSummary)'
   assetBundleName:

ml-agents-envs/mlagents_envs/base_env.py (16 lines changed)

     respectively.
     """

-    def __init__(self, continuous: np.ndarray, discrete: np.ndarray):
-        if continuous.dtype != np.float32:
+    def __init__(
+        self,
+        continuous: Optional[np.ndarray] = None,
+        discrete: Optional[np.ndarray] = None,
+    ):
+        if continuous is not None and continuous.dtype != np.float32:

-        if discrete.dtype != np.int32:
+        if discrete is not None and discrete.dtype != np.int32:
             discrete = discrete.astype(np.int32, copy=False)
         self._discrete = discrete

         continuous = np.random.uniform(
             low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
         )
-        discrete = np.array([])
+        discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
         if self.discrete_size > 0:
             discrete = np.column_stack(
                 [

         for the correct number of agents and ensures the type.
         """
         _expected_shape = (n_agents, self.continuous_size)
-        if actions.continuous.shape != _expected_shape:
+        if self.continuous_size > 0 and actions.continuous.shape != _expected_shape:
             raise UnityActionException(
                 f"The behavior {name} needs a continuous input of dimension "
                 f"{_expected_shape} for (<number of agents>, <action size>) but "

-        if actions.discrete.shape != _expected_shape:
+        if self.discrete_size > 0 and actions.discrete.shape != _expected_shape:
             raise UnityActionException(
                 f"The behavior {name} needs a discrete input of dimension "
                 f"{_expected_shape} for (<number of agents>, <action size>) but "

ml-agents-envs/mlagents_envs/environment.py (4 lines changed)

             )
         ) from ie
         if action_spec.continuous_size > 0:
-            self._env_actions[behavior_name].continuous[index] = action.continuous[0]
+            self._env_actions[behavior_name].continuous[index] = action.continuous[0, :]
-            self._env_actions[behavior_name].discrete[index] = action.discrete[0]
+            self._env_actions[behavior_name].discrete[index] = action.discrete[0, :]

     def get_steps(
         self, behavior_name: BehaviorName

ml-agents-envs/mlagents_envs/tests/test_envs.py (9 lines changed)

 import pytest
 from mlagents_envs.environment import UnityEnvironment
-from mlagents_envs.base_env import DecisionSteps, TerminalSteps
+from mlagents_envs.base_env import DecisionSteps, TerminalSteps, ActionTuple
 from mlagents_envs.exception import UnityEnvironmentException, UnityActionException
 from mlagents_envs.mock_communicator import MockCommunicator

     env.step()
     with pytest.raises(UnityActionException):
         env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents - 1))
     decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
     n_agents = len(decision_steps)
+    _empty_act = spec.action_spec.empty_action(n_agents)
+    next_action = ActionTuple(_empty_act.continuous - 1, _empty_act.discrete - 1)
+    env.set_actions("RealFakeBrain", next_action)
     env.step()

     env.close()
     assert isinstance(decision_steps, DecisionSteps)
     assert isinstance(terminal_steps, TerminalSteps)
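As a companion to the test above, a sketch of the low-level stepping loop these calls exercise; it assumes an environment with a single registered behavior (here reached through the Editor connection, i.e. file_name=None), not the mock communicator used by the test:

    from mlagents_envs.environment import UnityEnvironment

    env = UnityEnvironment(file_name=None)  # connect to a running Editor instance
    env.reset()
    behavior_name = list(env.behavior_specs)[0]
    spec = env.behavior_specs[behavior_name]

    for _ in range(10):
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        # random_action returns an ActionTuple sized for the agents awaiting a decision
        env.set_actions(behavior_name, spec.action_spec.random_action(len(decision_steps)))
        env.step()
    env.close()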

ml-agents/mlagents/trainers/agent_processor.py (7 lines changed)

                 action_probs[prob_type] = prob_array[idx]
             action_mask = stored_decision_step.action_mask
-            prev_action = self.policy.retrieve_previous_action([global_id])
-            prev_action_dict: Dict[str, np.ndarray] = {}
-            for _prev_act_type, _prev_act in prev_action.items():
-                prev_action_dict[_prev_act_type] = _prev_act[0, :]
+            prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
             experience = AgentExperience(
                 obs=obs,
                 reward=step.reward,

                 action_pre=action_pre,
                 action_mask=action_mask,
-                prev_action=prev_action_dict,
+                prev_action=prev_action,
                 interrupted=interrupted,
                 memory=memory,
             )

ml-agents/mlagents/trainers/env_manager.py (8 lines changed)

         return len(step_infos)

     @staticmethod
-    def action_buffers_from_numpy_dict(
-        action_dict: Dict[str, np.ndarray]
-    ) -> ActionTuple:
-        continuous: np.ndarray = np.array([], dtype=np.float32)
-        discrete: np.ndarray = np.array([], dtype=np.int32)
+    def action_tuple_from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> ActionTuple:
+        continuous: np.ndarray = None
+        discrete: np.ndarray = None
         if "continuous_action" in action_dict:
             continuous = action_dict["continuous_action"]
         if "discrete_action" in action_dict:

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2 lines changed)

             ]
             feed_dict[self.memory_in] = [np.zeros((self.m_size), dtype=np.float32)]
         if self.policy.prev_action is not None:
-            feed_dict[self.policy.prev_action] = batch["prev_discrete_action"]
+            feed_dict[self.policy.prev_action] = batch["prev_action"]
         if self.policy.use_recurrent:
             value_estimates, policy_mem, value_mem = self.sess.run(

ml-agents/mlagents/trainers/policy/policy.py (36 lines changed)

             1 for shape in behavior_spec.observation_shapes if len(shape) == 3
         )
         self.use_continuous_act = self.behavior_spec.action_spec.is_continuous()
-        self.previous_action_dict: Dict[str, Dict[str, np.ndarray]] = {}
+        self.previous_action_dict: Dict[str, np.ndarray] = {}
         self.memory_dict: Dict[str, np.ndarray] = {}
         self.normalize = trainer_settings.network_settings.normalize
         self.use_recurrent = self.network_settings.memory is not None

             if agent_id in self.memory_dict:
                 self.memory_dict.pop(agent_id)

-    def make_empty_previous_action(self, num_agents: int) -> Dict[str, np.ndarray]:
+    def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
-        :return: Dict of action type to np.ndarray
+        :return: Numpy array of zeros.
-        act_dict: Dict[str, np.ndarray] = {}
-        action_tuple = self.behavior_spec.action_spec.empty_action(num_agents)
-        if self.behavior_spec.action_spec.continuous_size > 0:
-            act_dict["continuous_action"] = action_tuple.continuous
-        if self.behavior_spec.action_spec.discrete_size > 0:
-            act_dict["discrete_action"] = action_tuple.discrete
-        return act_dict
+        return np.zeros(
+            (num_agents, self.behavior_spec.action_spec.discrete_size), dtype=np.int32
+        )

-        if action_dict is None:
+        if action_dict is None or "discrete_action" not in action_dict:
-            agent_action_dict: Dict[str, np.ndarray] = {}
-            for act_type in action_dict:
-                agent_action_dict[act_type] = action_dict[act_type][index, :]
-            self.previous_action_dict[agent_id] = agent_action_dict
+            self.previous_action_dict[agent_id] = action_dict["discrete_action"][
+                index, :
+            ]

-    def retrieve_previous_action(self, agent_ids: List[str]) -> Dict[str, np.ndarray]:
-        action_dict = self.make_empty_previous_action(len(agent_ids))
+    def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
+        action_matrix = self.make_empty_previous_action(len(agent_ids))

-            for act_type in action_dict:
-                action_dict[act_type][index, :] = self.previous_action_dict[
-                    agent_id
-                ][act_type]
-        return action_dict
+                action_matrix[index, :] = self.previous_action_dict[agent_id]
+        return action_matrix

     def remove_previous_action(self, agent_ids):
         for agent_id in agent_ids:
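Taken together, the policy now tracks only the discrete part of the previous action, as one (num_agents, discrete_size) int32 matrix keyed by agent id. A condensed, stand-alone sketch of that bookkeeping; the wrapper class is invented for illustration, while the method bodies follow the hunk above:

    from typing import Dict, List, Optional
    import numpy as np

    class PreviousActionBookkeeping:
        """Stand-in for the relevant slice of Policy; discrete_size is passed in."""

        def __init__(self, discrete_size: int):
            self.discrete_size = discrete_size
            self.previous_action_dict: Dict[str, np.ndarray] = {}

        def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
            # One row of zeros per agent, matching the new return type above.
            return np.zeros((num_agents, self.discrete_size), dtype=np.int32)

        def save_previous_action(
            self, agent_ids: List[str], action_dict: Optional[Dict[str, np.ndarray]]
        ) -> None:
            if action_dict is None or "discrete_action" not in action_dict:
                return
            for index, agent_id in enumerate(agent_ids):
                self.previous_action_dict[agent_id] = action_dict["discrete_action"][index, :]

        def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
            action_matrix = self.make_empty_previous_action(len(agent_ids))
            for index, agent_id in enumerate(agent_ids):
                if agent_id in self.previous_action_dict:
                    action_matrix[index, :] = self.previous_action_dict[agent_id]
            return action_matrix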

ml-agents/mlagents/trainers/policy/tf_policy.py (2 lines changed)

         if not self.use_continuous_act:
             feed_dict[self.prev_action] = self.retrieve_previous_action(
                 global_agent_ids
-            )["discrete_action"]
+            )
             feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
         feed_dict = self.fill_eval_dict(feed_dict, decision_requests)

ml-agents/mlagents/trainers/policy/torch_policy.py (9 lines changed)

         action, log_probs, entropy, value_heads, memories = self.sample_actions(
             vec_obs, vis_obs, masks=masks, memories=memories
         )
-        run_out["action"] = action.to_numpy_dict()
+        action_dict = action.to_numpy_dict()
+        run_out["action"] = action_dict
-            action.to_numpy_dict()["continuous_action"]
-            if self.action_spec.continuous_size > 0
-            else None
-        )  # Todo - make pre_action difference
+            action_dict["continuous_action"] if self.use_continuous_act else None
+        )
         run_out["log_probs"] = log_probs.to_numpy_dict()
         run_out["entropy"] = ModelUtils.to_numpy(entropy)
         run_out["value_heads"] = {

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (8 lines changed)

         else:
             if self.policy.use_continuous_act:  # For hybrid action buffer support
                 feed_dict[self.policy.output] = mini_batch["continuous_action"]
             if self.policy.use_recurrent:
-                feed_dict[self.policy.prev_action] = mini_batch[
-                    "prev_continuous_action"
-                ]
-                feed_dict[self.policy.prev_action] = mini_batch[
-                    "prev_discrete_action"
-                ]
+                feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
             feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
         if "vector_obs" in mini_batch:
             feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]

ml-agents/mlagents/trainers/sac/optimizer_tf.py (2 lines changed)

         else:
             feed_dict[policy.output] = batch["discrete_action"]
             if self.policy.use_recurrent:
-                feed_dict[policy.prev_action] = batch["prev_discrete_action"]
+                feed_dict[policy.prev_action] = batch["prev_action"]
             feed_dict[policy.action_masks] = batch["action_mask"]
         if self.policy.use_vec_obs:
             feed_dict[policy.vector_in] = batch["vector_obs"]

ml-agents/mlagents/trainers/simple_env_manager.py (2 lines changed)

         self.previous_all_action_info = all_action_info
         for brain_name, action_info in all_action_info.items():
-            _action = EnvManager.action_buffers_from_numpy_dict(action_info.action)
+            _action = EnvManager.action_tuple_from_numpy_dict(action_info.action)
             self.env.set_actions(brain_name, _action)
         self.env.step()
         all_step_result = self._generate_all_results()

ml-agents/mlagents/trainers/subprocess_env_manager.py (2 lines changed)

                 all_action_info = req.payload
                 for brain_name, action_info in all_action_info.items():
                     if len(action_info.action) != 0:
-                        _action = EnvManager.action_buffers_from_numpy_dict(
+                        _action = EnvManager.action_tuple_from_numpy_dict(
                             action_info.action
                         )
                         env.set_actions(brain_name, _action)

ml-agents/mlagents/trainers/tests/mock_brain.py (6 lines changed)

         if action_spec.is_discrete()
         else None
     )
-    if action_spec.is_continuous():
-        prev_action = {"continuous_action": np.ones(action_size, dtype=np.float32)}
+    if action_spec.is_discrete():
+        prev_action = np.ones(action_size, dtype=np.int32)
-        prev_action = {"discrete_action": np.ones(action_size, dtype=np.float32)}
+        prev_action = np.ones(action_size, dtype=np.float32)
     max_step = False
     memory = np.ones(memory_size, dtype=np.float32)

ml-agents/mlagents/trainers/tests/test_agent_processor.py (4 lines changed)

     mock_policy = mock.Mock()
     mock_policy.reward_signals = {}
     mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
-    mock_policy.retrieve_previous_action.return_value = {
-        "prev_continuous_action": np.zeros((1, 1), dtype=np.float32)
-    }
+    mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.int32)
     return mock_policy

ml-agents/mlagents/trainers/tests/test_trajectory.py (2 lines changed)

         "continuous_action",
         "action_probs",
         "action_mask",
-        "prev_continuous_action",
+        "prev_action",
         "environment_rewards",
     ]
     wanted_keys = set(wanted_keys)

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py (4 lines changed)

     ]
     action_buffer = behavior_spec.action_spec.random_action(1)
     action = {}
-    if action_buffer.continuous is not None:
+    if behavior_spec.action_spec.continuous_size > 0:
-    if action_buffer.discrete is not None:
+    if behavior_spec.action_spec.discrete_size > 0:
         action["discrete_action"] = action_buffer.discrete
     for _ in range(number):

ml-agents/mlagents/trainers/torch/utils.py (4 lines changed)

     """

     continuous_tensor: torch.Tensor
-    discrete_list: List[torch.Tensor]
+    discrete_list: Optional[List[torch.Tensor]]

     @property
     def discrete_tensor(self):

     """

     continuous_tensor: torch.Tensor
-    discrete_list: List[torch.Tensor]
+    discrete_list: Optional[List[torch.Tensor]]
     all_discrete_list: Optional[List[torch.Tensor]]

     @property
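With discrete_list now Optional, any code that builds the stacked discrete tensor presumably has to tolerate the None case. A hedged illustration of that stacking (the function name and the empty-tensor fallback are invented; the real property body is not shown in this diff):

    from typing import List, Optional
    import torch

    def discrete_tensor_from_list(
        discrete_list: Optional[List[torch.Tensor]], batch_size: int
    ) -> torch.Tensor:
        # Each entry of discrete_list is a (batch,) tensor for one discrete branch.
        if discrete_list is None or len(discrete_list) == 0:
            return torch.zeros((batch_size, 0), dtype=torch.long)
        # Stack branch-wise into a (batch, num_branches) tensor.
        return torch.stack(discrete_list, dim=-1)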

ml-agents/mlagents/trainers/trajectory.py (8 lines changed)

     action_probs: Dict[str, np.ndarray]
     action_pre: np.ndarray  # TODO: Remove this
     action_mask: np.ndarray
-    prev_action: Dict[str, np.ndarray]
+    prev_action: np.ndarray
     interrupted: bool
     memory: np.ndarray

             agent_buffer_trajectory["actions_pre"].append(exp.action_pre)

             # Adds the log prob and action of continuous/discrete separately
             action_shape = None
             for act_type, act_array in exp.action.items():
                 agent_buffer_trajectory[act_type].append(act_array)
             for log_type, log_array in exp.action_probs.items():

             agent_buffer_trajectory["action_mask"].append(
                 np.ones(action_shape, dtype=np.float32), padding_value=1
             )
-            for act_type, act_array in exp.prev_action.items():
-                agent_buffer_trajectory["prev_" + act_type].append(act_array)
+            agent_buffer_trajectory["prev_action"].append(exp.prev_action)
             agent_buffer_trajectory["environment_rewards"].append(exp.reward)
             # Store the next visual obs as the current
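A small sketch of what the buffer write now looks like: the whole previous action goes in as a single int32 row under one key, instead of one "prev_<type>_action" entry per action type (the two-branch shape is made up for the example):

    import numpy as np
    from mlagents.trainers.buffer import AgentBuffer

    agent_buffer_trajectory = AgentBuffer()
    prev_action = np.zeros(2, dtype=np.int32)  # e.g. two discrete branches

    # One key replaces the per-type "prev_continuous_action"/"prev_discrete_action" keys.
    agent_buffer_trajectory["prev_action"].append(prev_action)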
