
save only discrete actions as prev

/develop/action-spec-gym
Andrew Cohen, 4 years ago
Current commit: 3457cd3c
10 files changed: 26 insertions, 49 deletions
  1. ml-agents/mlagents/trainers/agent_processor.py (7 lines changed)
  2. ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2 lines changed)
  3. ml-agents/mlagents/trainers/policy/policy.py (34 lines changed)
  4. ml-agents/mlagents/trainers/policy/tf_policy.py (2 lines changed)
  5. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (8 lines changed)
  6. ml-agents/mlagents/trainers/sac/optimizer_tf.py (2 lines changed)
  7. ml-agents/mlagents/trainers/tests/mock_brain.py (7 lines changed)
  8. ml-agents/mlagents/trainers/tests/test_agent_processor.py (4 lines changed)
  9. ml-agents/mlagents/trainers/tests/test_trajectory.py (2 lines changed)
  10. ml-agents/mlagents/trainers/trajectory.py (7 lines changed)

ml-agents/mlagents/trainers/agent_processor.py (7 lines changed)


      action_probs[prob_type] = prob_array[idx]
  action_mask = stored_decision_step.action_mask
- prev_action = self.policy.retrieve_previous_action([global_id])
- prev_action_dict: Dict[str, np.ndarray] = {}
- for _prev_act_type, _prev_act in prev_action.items():
-     prev_action_dict[_prev_act_type] = _prev_act[0, :]
+ prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
  experience = AgentExperience(
      obs=obs,
      reward=step.reward,

      action_pre=action_pre,
      action_mask=action_mask,
-     prev_action=prev_action_dict,
+     prev_action=prev_action,
      interrupted=interrupted,
      memory=memory,
  )
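
For orientation (not part of the commit): after this change, Policy.retrieve_previous_action returns a (num_agents, discrete_size) int32 matrix, so the processor takes [0, :] to get the single agent's row instead of unpacking a per-type dict. A minimal sketch of that shape handling, using a hypothetical stub and an assumed discrete_size of 2:

import numpy as np

def retrieve_previous_action_stub(agent_ids, discrete_size=2):
    # Stand-in for Policy.retrieve_previous_action: one int32 row of previous
    # discrete actions per requested agent id.
    return np.zeros((len(agent_ids), discrete_size), dtype=np.int32)

matrix = retrieve_previous_action_stub(["agent-0"])  # shape (1, discrete_size)
prev_action = matrix[0, :]                           # row for the single agent
assert prev_action.shape == (2,) and prev_action.dtype == np.int32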

ml-agents/mlagents/trainers/optimizer/tf_optimizer.py (2 lines changed)


          ]
          feed_dict[self.memory_in] = [np.zeros((self.m_size), dtype=np.float32)]
      if self.policy.prev_action is not None:
-         feed_dict[self.policy.prev_action] = batch["prev_discrete_action"]
+         feed_dict[self.policy.prev_action] = batch["prev_action"]
      if self.policy.use_recurrent:
          value_estimates, policy_mem, value_mem = self.sess.run(

ml-agents/mlagents/trainers/policy/policy.py (34 lines changed)


      1 for shape in behavior_spec.observation_shapes if len(shape) == 3
  )
  self.use_continuous_act = self.behavior_spec.action_spec.is_continuous()
- self.previous_action_dict: Dict[str, Dict[str, np.ndarray]] = {}
+ self.previous_action_dict: Dict[str, np.ndarray] = {}
  self.memory_dict: Dict[str, np.ndarray] = {}
  self.normalize = trainer_settings.network_settings.normalize
  self.use_recurrent = self.network_settings.memory is not None

          if agent_id in self.memory_dict:
              self.memory_dict.pop(agent_id)

- def make_empty_previous_action(self, num_agents: int) -> Dict[str, np.ndarray]:
+ def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
-     act_dict: Dict[str, np.ndarray] = {}
-     action_tuple = self.behavior_spec.action_spec.empty_action(num_agents)
-     if self.behavior_spec.action_spec.continuous_size > 0:
-         act_dict["continuous_action"] = action_tuple.continuous
-     if self.behavior_spec.action_spec.discrete_size > 0:
-         act_dict["discrete_action"] = action_tuple.discrete
-     return act_dict
+     return np.zeros(
+         (num_agents, self.behavior_spec.action_spec.discrete_size), dtype=np.int32
+     )

-     if action_dict is None:
+     if action_dict is None or "discrete_action" not in action_dict:
-         agent_action_dict: Dict[str, np.ndarray] = {}
-         for act_type in action_dict:
-             agent_action_dict[act_type] = action_dict[act_type][index, :]
-         self.previous_action_dict[agent_id] = agent_action_dict
+         self.previous_action_dict[agent_id] = action_dict["discrete_action"][
+             index, :
+         ]

- def retrieve_previous_action(self, agent_ids: List[str]) -> Dict[str, np.ndarray]:
-     action_dict = self.make_empty_previous_action(len(agent_ids))
+ def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
+     action_matrix = self.make_empty_previous_action(len(agent_ids))
-             for act_type in action_dict:
-                 action_dict[act_type][index, :] = self.previous_action_dict[
-                     agent_id
-                 ][act_type]
-     return action_dict
+             action_matrix[index, :] = self.previous_action_dict[agent_id]
+     return action_matrix

  def remove_previous_action(self, agent_ids):
      for agent_id in agent_ids:
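
As a reading aid for the rewritten helpers above, here is a self-contained sketch of the bookkeeping they now perform, keeping only discrete actions per agent. DiscretePrevActionStore and its discrete_size argument are illustrative stand-ins, not ml-agents code; the real methods live on Policy and read sizes from behavior_spec.action_spec.

from typing import Dict, List, Optional
import numpy as np

class DiscretePrevActionStore:
    """Illustrative stand-in for the Policy previous-action helpers."""

    def __init__(self, discrete_size: int) -> None:
        self.discrete_size = discrete_size
        self.previous_action_dict: Dict[str, np.ndarray] = {}

    def make_empty_previous_action(self, num_agents: int) -> np.ndarray:
        # One zeroed int32 row per agent, matching the new return type.
        return np.zeros((num_agents, self.discrete_size), dtype=np.int32)

    def save_previous_action(
        self, agent_ids: List[str], action_dict: Optional[Dict[str, np.ndarray]]
    ) -> None:
        # Only the discrete component is kept; purely continuous output is skipped.
        if action_dict is None or "discrete_action" not in action_dict:
            return
        for index, agent_id in enumerate(agent_ids):
            self.previous_action_dict[agent_id] = action_dict["discrete_action"][index, :]

    def retrieve_previous_action(self, agent_ids: List[str]) -> np.ndarray:
        action_matrix = self.make_empty_previous_action(len(agent_ids))
        for index, agent_id in enumerate(agent_ids):
            if agent_id in self.previous_action_dict:
                action_matrix[index, :] = self.previous_action_dict[agent_id]
        return action_matrix

store = DiscretePrevActionStore(discrete_size=2)
store.save_previous_action(["a0"], {"discrete_action": np.array([[1, 3]], dtype=np.int32)})
print(store.retrieve_previous_action(["a0", "a1"]))  # [[1 3] [0 0]]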

ml-agents/mlagents/trainers/policy/tf_policy.py (2 lines changed)


      if not self.use_continuous_act:
          feed_dict[self.prev_action] = self.retrieve_previous_action(
              global_agent_ids
-         )["discrete_action"]
+         )
      feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
  feed_dict = self.fill_eval_dict(feed_dict, decision_requests)

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (8 lines changed)


  else:
      if self.policy.use_continuous_act:  # For hybrid action buffer support
          feed_dict[self.policy.output] = mini_batch["continuous_action"]
-         if self.policy.use_recurrent:
-             feed_dict[self.policy.prev_action] = mini_batch[
-                 "prev_continuous_action"
-             ]
-     feed_dict[self.policy.prev_action] = mini_batch[
-         "prev_discrete_action"
-     ]
+     feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
      feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
  if "vector_obs" in mini_batch:
      feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]

ml-agents/mlagents/trainers/sac/optimizer_tf.py (2 lines changed)


  else:
      feed_dict[policy.output] = batch["discrete_action"]
      if self.policy.use_recurrent:
-         feed_dict[policy.prev_action] = batch["prev_discrete_action"]
+         feed_dict[policy.prev_action] = batch["prev_action"]
      feed_dict[policy.action_masks] = batch["action_mask"]
  if self.policy.use_vec_obs:
      feed_dict[policy.vector_in] = batch["vector_obs"]
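
The two optimizer edits above are the consumer side of the same rename: recurrent runs now read previous actions from the single prev_action buffer key instead of prev_discrete_action. A rough sketch of that feed-dict pattern, using a plain dict in place of the real AgentBuffer and a string standing in for the TF placeholder (both are assumptions for illustration):

import numpy as np

prev_action_placeholder = "policy/prev_action:0"  # hypothetical placeholder name
batch = {"prev_action": np.zeros((64, 2), dtype=np.int32)}  # (batch_size, discrete_size)

feed_dict = {}
use_recurrent = True
if use_recurrent:
    # Before this commit the key read here was "prev_discrete_action".
    feed_dict[prev_action_placeholder] = batch["prev_action"]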

ml-agents/mlagents/trainers/tests/mock_brain.py (7 lines changed)


      if action_spec.is_discrete()
      else None
  )
- if action_spec.is_continuous():
-     prev_action = {"continuous_action": np.ones(action_size, dtype=np.float32)}
- else:
-     prev_action = {"discrete_action": np.ones(action_size, dtype=np.float32)}
+ prev_action = None
+ if action_spec.is_discrete():
+     prev_action = np.ones(action_size, dtype=np.int32)
  max_step = False
  memory = np.ones(memory_size, dtype=np.float32)

ml-agents/mlagents/trainers/tests/test_agent_processor.py (4 lines changed)


  mock_policy = mock.Mock()
  mock_policy.reward_signals = {}
  mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
- mock_policy.retrieve_previous_action.return_value = {
-     "prev_continuous_action": np.zeros((1, 1), dtype=np.float32)
- }
+ mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1), dtype=np.int32)
  return mock_policy

ml-agents/mlagents/trainers/tests/test_trajectory.py (2 lines changed)


"continuous_action",
"action_probs",
"action_mask",
"prev_continuous_action",
"prev_action",
"environment_rewards",
]
wanted_keys = set(wanted_keys)

ml-agents/mlagents/trainers/trajectory.py (7 lines changed)


  action_probs: Dict[str, np.ndarray]
  action_pre: np.ndarray  # TODO: Remove this
  action_mask: np.ndarray
- prev_action: Dict[str, np.ndarray]
+ prev_action: np.ndarray
  interrupted: bool
  memory: np.ndarray

  agent_buffer_trajectory["action_mask"].append(
      np.ones(action_shape, dtype=np.float32), padding_value=1
  )
- for act_type, act_array in exp.prev_action.items():
-     agent_buffer_trajectory["prev_" + act_type].append(act_array)
+ agent_buffer_trajectory["prev_action"].append(exp.prev_action)
  agent_buffer_trajectory["environment_rewards"].append(exp.reward)
  # Store the next visual obs as the current
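
Tying producer and consumer together, a minimal sketch (plain dict-of-lists standing in for AgentBuffer, with invented values) of how each experience's prev_action now lands under one buffer key and is later stacked into the batch the optimizers read:

from collections import defaultdict
import numpy as np

agent_buffer_trajectory = defaultdict(list)  # stand-in for AgentBuffer

# Each experience now carries a single int32 vector of previous discrete actions.
for prev_action in [np.array([0, 1], dtype=np.int32), np.array([2, 0], dtype=np.int32)]:
    agent_buffer_trajectory["prev_action"].append(prev_action)

# The optimizers later read the same "prev_action" key from the sampled batch.
batch_prev_actions = np.stack(agent_buffer_trajectory["prev_action"])
assert batch_prev_actions.shape == (2, 2)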
