
test_simple_rl/reward providers pass tf/torch

/develop/action-spec-gym
Andrew Cohen, 4 years ago
Current commit
8172b3d6
9 changed files with 73 additions and 34 deletions
  1. ml-agents-envs/mlagents_envs/base_env.py (24 changed lines)
  2. ml-agents/mlagents/trainers/policy/tf_policy.py (6 changed lines)
  3. ml-agents/mlagents/trainers/ppo/optimizer_tf.py (7 changed lines)
  4. ml-agents/mlagents/trainers/sac/optimizer_tf.py (6 changed lines)
  5. ml-agents/mlagents/trainers/tests/mock_brain.py (25 changed lines)
  6. ml-agents/mlagents/trainers/tf/components/bc/module.py (6 changed lines)
  7. ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py (8 changed lines)
  8. ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py (15 changed lines)
  9. ml-agents/mlagents/trainers/trajectory.py (10 changed lines)

ml-agents-envs/mlagents_envs/base_env.py (24 changed lines)


Continuous and discrete actions are numpy arrays.
"""
- continuous: List[np.ndarray]
- discrete: List[np.ndarray]
+ continuous: np.ndarray # dims (n_agents, cont_size)
+ discrete: np.ndarray # dims (n_agents, disc_size)
@staticmethod
def from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> "ActionBuffers":

"""
action_dict: Dict[str, np.ndarray] = {}
if self.continuous_size > 0:
- action_dict["continuous_action"] = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
+ action_dict["continuous_action"] = np.zeros(
+     (n_agents, self.continuous_size), dtype=np.float32
+ )
- action_dict["discrete_action"] = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
+ action_dict["discrete_action"] = np.zeros(
+     (n_agents, self.discrete_size), dtype=np.int32
+ )
- # return ActionBuffers(
- #     np.zeros((n_agents, self.continuous_size), dtype=np.float32),
- #     np.zeros((n_agents, self.discrete_size), dtype=np.int32),
- # )
+ # return ActionBuffers(
+ #     np.zeros((n_agents, self.continuous_size), dtype=np.float32),
+ #     np.zeros((n_agents, self.discrete_size), dtype=np.int32),
+ # )
def random_action(self, n_agents: int) -> Dict[str, np.ndarray]:
"""

)
action_dict["discrete_action"] = discrete_action
return action_dict
- #return ActionBuffers(continuous_action, discrete_action)
+ # return ActionBuffers(continuous_action, discrete_action)
def _validate_action(
self, actions: ActionBuffers, n_agents: int, name: str

if actions.discrete.dtype != np.int32:
actions.discrete = actions.discrete.astype(np.int32)
- return action
+ return actions
@staticmethod
def create_continuous(continuous_size: int) -> "ActionSpec":
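
The hunk above moves ActionBuffers from per-agent lists to batched arrays and has the action helpers return plain dicts keyed "continuous_action" / "discrete_action". The following is a minimal, self-contained sketch of that layout; the empty_action helper and the body of from_numpy_dict are illustrative reconstructions (only their signatures and key names come from the diff), not the actual implementation in mlagents_envs.base_env.

    from typing import Dict, NamedTuple

    import numpy as np


    class ActionBuffers(NamedTuple):
        """Continuous and discrete actions as batched numpy arrays."""

        continuous: np.ndarray  # dims (n_agents, cont_size)
        discrete: np.ndarray  # dims (n_agents, disc_size)

        @staticmethod
        def from_numpy_dict(action_dict: Dict[str, np.ndarray]) -> "ActionBuffers":
            # Assumed behavior: pull the two well-known keys back out of a
            # buffer-style dict; the diff only shows this method's signature.
            return ActionBuffers(
                action_dict.get("continuous_action", np.zeros((0, 0), dtype=np.float32)),
                action_dict.get("discrete_action", np.zeros((0, 0), dtype=np.int32)),
            )


    def empty_action(continuous_size: int, discrete_size: int, n_agents: int) -> Dict[str, np.ndarray]:
        # Sketch of the dict-returning empty_action shown in the hunk.
        action_dict: Dict[str, np.ndarray] = {}
        if continuous_size > 0:
            action_dict["continuous_action"] = np.zeros(
                (n_agents, continuous_size), dtype=np.float32
            )
        if discrete_size > 0:
            action_dict["discrete_action"] = np.zeros(
                (n_agents, discrete_size), dtype=np.int32
            )
        return action_dict


    buffers = ActionBuffers.from_numpy_dict(empty_action(2, 3, n_agents=4))
    assert buffers.continuous.shape == (4, 2) and buffers.discrete.dtype == np.int32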

ml-agents/mlagents/trainers/policy/tf_policy.py (6 changed lines)


)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
+ # For Compatibility with buffer changes for hybrid action support
+ run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
+ if self.behavior_spec.action_spec.is_continuous():
+     run_out["action"] = {"continuous_action": run_out["action"]}
+ else:
+     run_out["action"] = {"discrete_action": run_out["action"]}
return ActionInfo(
action=run_out.get("action"),
value=run_out.get("value"),
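
The six added lines wrap the flat TF outputs into the dict layout the updated buffers expect. A rough standalone illustration of that wrapping; wrap_for_hybrid_buffer is a hypothetical helper name, and run_out is a plain dict standing in for the policy's evaluate output (the key names are the ones in the diff).

    from typing import Any, Dict

    import numpy as np


    def wrap_for_hybrid_buffer(run_out: Dict[str, Any], is_continuous: bool) -> Dict[str, Any]:
        # Log probs are stored under a single "action_probs" entry...
        run_out["log_probs"] = {"action_probs": run_out["log_probs"]}
        # ...and the raw action is keyed by its branch type, matching the buffer keys.
        key = "continuous_action" if is_continuous else "discrete_action"
        run_out["action"] = {key: run_out["action"]}
        return run_out


    run_out = {"action": np.zeros((1, 2), np.float32), "log_probs": np.zeros((1, 2), np.float32)}
    assert set(wrap_for_hybrid_buffer(run_out, is_continuous=True)["action"]) == {"continuous_action"}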

ml-agents/mlagents/trainers/ppo/optimizer_tf.py (7 changed lines)


)
stats_needed.update(reward_signal.stats_name_to_update_name)
+ for tens, d in feed_dict.items():
+     print(tens, d)
update_vals = self._execute_model(feed_dict, self.update_dict)
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]

if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
else:
- feed_dict[self.policy.output] = mini_batch["actions"]
+ if self.policy.use_continuous_act: # For hybrid action buffer support
+     feed_dict[self.policy.output] = mini_batch["continuous_action"]
+ else:
+     feed_dict[self.policy.output] = mini_batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
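
With the buffer keyed per action branch, the optimizer now chooses which mini-batch column feeds the policy output placeholder instead of reading a single "actions" column. A toy version of that selection; add_action_feed is a hypothetical helper, and a plain string stands in for the TF placeholder.

    from typing import Any, Dict

    import numpy as np


    def add_action_feed(
        feed_dict: Dict[Any, np.ndarray],
        policy_output: Any,
        mini_batch: Dict[str, np.ndarray],
        use_continuous_act: bool,
    ) -> Dict[Any, np.ndarray]:
        # For hybrid action buffer support the batch carries
        # "continuous_action" and/or "discrete_action" instead of "actions".
        if use_continuous_act:
            feed_dict[policy_output] = mini_batch["continuous_action"]
        else:
            feed_dict[policy_output] = mini_batch["discrete_action"]
        return feed_dict


    mini_batch = {
        "continuous_action": np.zeros((64, 2), dtype=np.float32),
        "discrete_action": np.zeros((64, 1), dtype=np.int32),
    }
    feeds = add_action_feed({}, "policy_output_placeholder", mini_batch, use_continuous_act=True)
    assert feeds["policy_output_placeholder"].shape == (64, 2)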

ml-agents/mlagents/trainers/sac/optimizer_tf.py (6 changed lines)


feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
if self.policy.use_continuous_act:
- feed_dict[self.policy_network.external_action_in] = batch["actions"]
+ feed_dict[self.policy_network.external_action_in] = batch[
+     "continuous_action"
+ ]
- feed_dict[policy.output] = batch["actions"]
+ feed_dict[policy.output] = batch["discrete_action"]
if self.policy.use_recurrent:
feed_dict[policy.prev_action] = batch["prev_action"]
feed_dict[policy.action_masks] = batch["action_mask"]

ml-agents/mlagents/trainers/tests/mock_brain.py (25 changed lines)


steps_list = []
action_size = action_spec.discrete_size + action_spec.continuous_size
- action_probs = np.ones(
-     int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
-     dtype=np.float32,
- )
+ action_probs = {
+     "action_probs": np.ones(
+         int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
+         dtype=np.float32,
+     )
+ }
for _i in range(length - 1):
obs = []
for _shape in observation_shapes:

- action = np.zeros(action_size, dtype=np.float32)
+ if action_spec.is_continuous():
+     action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
+ else:
+     action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
action_pre = np.zeros(action_size, dtype=np.float32)
action_mask = (
[

if action_spec.is_discrete()
else None
)
- prev_action = np.ones(action_size, dtype=np.float32)
+ if action_spec.is_continuous():
+     prev_action = {
+         "prev_continuous_action": np.ones(action_size, dtype=np.float32)
+     }
+ else:
+     prev_action = {
+         "prev_discrete_action": np.ones(action_size, dtype=np.float32)
+     }
max_step = False
memory = np.ones(memory_size, dtype=np.float32)
agent_id = "test_agent"
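
The test helper now fabricates its fake steps in the same keyed form the trainers consume. A condensed sketch of just the action-related entries of one fake step; make_fake_action_entries is a hypothetical helper and the shapes are simplified (the real make_fake_trajectory also fills observations, rewards, memories, and sizes the log probs from the discrete branches).

    from typing import Dict

    import numpy as np


    def make_fake_action_entries(
        continuous_size: int, discrete_size: int
    ) -> Dict[str, Dict[str, np.ndarray]]:
        # Actions, previous actions, and log probs become dicts keyed by
        # branch type rather than flat arrays.
        action_size = continuous_size + discrete_size
        if discrete_size == 0:
            action = {"continuous_action": np.zeros(action_size, dtype=np.float32)}
            prev_action = {"prev_continuous_action": np.ones(action_size, dtype=np.float32)}
        else:
            action = {"discrete_action": np.zeros(action_size, dtype=np.float32)}
            prev_action = {"prev_discrete_action": np.ones(action_size, dtype=np.float32)}
        action_probs = {"action_probs": np.ones(action_size, dtype=np.float32)}
        return {"action": action, "prev_action": prev_action, "action_probs": action_probs}


    step = make_fake_action_entries(continuous_size=2, discrete_size=0)
    assert "continuous_action" in step["action"]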

ml-agents/mlagents/trainers/tf/components/bc/module.py (6 changed lines)


self.policy.batch_size_ph: n_sequences,
self.policy.sequence_length_ph: self.policy.sequence_length,
}
- feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
+ feed_dict[self.model.action_in_expert] = mini_batch_demo["discrete_action"]
feed_dict[self.policy.action_masks] = np.ones(
(
self.n_sequences * self.policy.sequence_length,

)
+ else:
+     feed_dict[self.model.action_in_expert] = mini_batch_demo[
+         "continuous_action"
+     ]
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
for i, _ in enumerate(self.policy.visual_in):
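
For behavioral cloning the expert demo batch is read with the same branch keys, and the discrete branch additionally feeds an all-ones action mask. A small sketch of that branch logic; expert_action_feed is a hypothetical helper, and the mask width is passed in explicitly where the real module sums the discrete branch sizes.

    from typing import Dict, Optional, Tuple

    import numpy as np


    def expert_action_feed(
        mini_batch_demo: Dict[str, np.ndarray],
        use_continuous_act: bool,
        n_steps: int,
        total_discrete_branches: int,
    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        # Continuous: feed the expert actions directly, no mask needed.
        if use_continuous_act:
            return mini_batch_demo["continuous_action"], None
        # Discrete: feed the expert branch indices plus an all-ones mask
        # (every action allowed), matching the np.ones(...) in the diff.
        mask = np.ones((n_steps, total_discrete_branches), dtype=np.float32)
        return mini_batch_demo["discrete_action"], mask


    demo = {"discrete_action": np.zeros((12, 1), dtype=np.int32)}
    actions, mask = expert_action_feed(demo, use_continuous_act=False, n_steps=12, total_discrete_branches=3)
    assert mask is not None and mask.shape == (12, 3)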

ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py (8 changed lines)


feed_dict[self.model.next_visual_in[i]] = _next_obs
if self.policy.use_continuous_act:
- feed_dict[self.policy.selected_actions] = mini_batch["actions"]
+ feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
- feed_dict[self.policy.output] = mini_batch["actions"]
+ feed_dict[self.policy.output] = mini_batch["discrete_action"]
unscaled_reward = self.policy.sess.run(
self.model.intrinsic_reward, feed_dict=feed_dict
)

policy.mask_input: mini_batch["masks"],
}
if self.policy.use_continuous_act:
- feed_dict[policy.selected_actions] = mini_batch["actions"]
+ feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
- feed_dict[policy.output] = mini_batch["actions"]
+ feed_dict[policy.output] = mini_batch["discrete_action"]
if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = mini_batch["vector_obs"]
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]

ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py (15 changed lines)


feed_dict[self.policy.visual_in[i]] = _obs
if self.policy.use_continuous_act:
- feed_dict[self.policy.selected_actions] = mini_batch["actions"]
+ feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
- feed_dict[self.policy.output] = mini_batch["actions"]
+ feed_dict[self.policy.output] = mini_batch["discrete_action"]
feed_dict[self.model.done_policy_holder] = np.array(
mini_batch["done"]
).flatten()

if self.model.use_vail:
feed_dict[self.model.use_noise] = [1]
- feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
- feed_dict[policy.selected_actions] = mini_batch["actions"]
+ feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
+ feed_dict[self.model.action_in_expert] = np.array(
+     mini_batch_demo["continuous_action"]
+ )
- feed_dict[policy.output] = mini_batch["actions"]
+ feed_dict[policy.output] = mini_batch["discrete_action"]
+ feed_dict[self.model.action_in_expert] = np.array(
+     mini_batch_demo["discrete_action"]
+ )
if self.policy.use_vis_obs > 0:
for i in range(len(policy.visual_in)):
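
GAIL applies the same key selection twice per update, once to the policy mini-batch and once to the expert demo batch, so the discriminator compares actions from the same branch. A compact sketch of that pairing; gail_action_feeds is a hypothetical helper, and string keys stand in for the TF placeholders.

    from typing import Dict

    import numpy as np


    def gail_action_feeds(
        mini_batch: Dict[str, np.ndarray],
        mini_batch_demo: Dict[str, np.ndarray],
        use_continuous_act: bool,
    ) -> Dict[str, np.ndarray]:
        # Policy and expert actions must come from the same branch type.
        if use_continuous_act:
            return {
                "policy.selected_actions": mini_batch["continuous_action"],
                "model.action_in_expert": np.array(mini_batch_demo["continuous_action"]),
            }
        return {
            "policy.output": mini_batch["discrete_action"],
            "model.action_in_expert": np.array(mini_batch_demo["discrete_action"]),
        }


    feeds = gail_action_feeds(
        {"discrete_action": np.zeros((8, 1), dtype=np.int32)},
        {"discrete_action": np.ones((8, 1), dtype=np.int32)},
        use_continuous_act=False,
    )
    assert feeds["model.action_in_expert"].shape == (8, 1)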

ml-agents/mlagents/trainers/trajectory.py (10 changed lines)


from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.base_env import ActionBuffers
class AgentExperience(NamedTuple):
obs: List[np.ndarray]

if exp.action_pre is not None:
agent_buffer_trajectory["actions_pre"].append(exp.action_pre)
- # Adds the log prob and action of continuous/discrete separately
+ # Adds the log prob and action of continuous/discrete separately
+ action_shape = None
+ action_shape = act_array.shape # TODO Better way to make mask
for log_type, log_array in exp.action_probs.items():
agent_buffer_trajectory[log_type].append(log_array)

# This should never be needed unless the environment somehow doesn't supply the
# action mask in a discrete space.
agent_buffer_trajectory["action_mask"].append(
- np.ones(exp.action_probs["continuous_log_probs"].shape, dtype=np.float32), padding_value=1
+ np.ones(action_shape, dtype=np.float32), padding_value=1
- #agent_buffer_trajectory["prev_action"].append(exp.prev_action)
+ # agent_buffer_trajectory["prev_action"].append(exp.prev_action)
for act_type, act_array in exp.prev_action.items():
agent_buffer_trajectory["prev_" + act_type].append(act_array)
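
On the trajectory side each experience's keyed actions are appended column by column, the last action shape seen is reused to build a default all-ones action mask, and previous actions are stored under "prev_"-prefixed keys. A standalone sketch of that bookkeeping; append_experience is a hypothetical helper, plain lists stand in for AgentBuffer fields, and the padding_value argument from the diff is dropped.

    from typing import Dict, List

    import numpy as np


    def append_experience(
        buffer: Dict[str, List[np.ndarray]],
        action: Dict[str, np.ndarray],
        action_probs: Dict[str, np.ndarray],
        prev_action: Dict[str, np.ndarray],
    ) -> None:
        # Continuous and discrete entries land under their own keys.
        action_shape = None
        for act_type, act_array in action.items():
            buffer.setdefault(act_type, []).append(act_array)
            action_shape = act_array.shape  # remembered to size the default mask
        for log_type, log_array in action_probs.items():
            buffer.setdefault(log_type, []).append(log_array)
        # Default mask: everything allowed (only meaningful for discrete spaces).
        buffer.setdefault("action_mask", []).append(np.ones(action_shape, dtype=np.float32))
        # Previous actions keep their branch type but get a "prev_" prefix.
        for act_type, act_array in prev_action.items():
            buffer.setdefault("prev_" + act_type, []).append(act_array)


    buf: Dict[str, List[np.ndarray]] = {}
    append_experience(
        buf,
        action={"discrete_action": np.zeros(2, dtype=np.float32)},
        action_probs={"action_probs": np.ones(2, dtype=np.float32)},
        prev_action={"discrete_action": np.ones(2, dtype=np.float32)},
    )
    assert "prev_discrete_action" in buf and buf["action_mask"][0].shape == (2,)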
