
Modification of reward signals and rl_trainer for SAC (#2433)

* Adds evaluate_batch to reward signals, which evaluates on a minibatch rather than on a BrainInfo (see the sketch below).
* Changes the way reward signal results are reported in rl_trainer so that the pure, unprocessed environment reward is kept separate from the reward-signal rewards.
* Moves end_episode to rl_trainer.
* Fixes a bug in BCModule when used with an RNN.
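A minimal sketch of the reporting pattern these bullets describe, assuming simplified stand-ins for the real classes (RewardSignalResult and AllRewardsOutput mirror the NamedTuples in the diff below; everything else is illustrative):

    # Illustrative only: trimmed-down versions of the types used in this change.
    from typing import Dict, NamedTuple
    import numpy as np

    class RewardSignalResult(NamedTuple):
        scaled_reward: np.ndarray    # reward after applying the signal's strength
        unscaled_reward: np.ndarray  # raw reward produced by the signal

    class AllRewardsOutput(NamedTuple):
        reward_signals: Dict[str, RewardSignalResult]
        environment: np.ndarray      # pure, unprocessed environment reward

    signals = {
        "curiosity": RewardSignalResult(np.array([0.05, 0.10]), np.array([0.5, 1.0]))
    }
    rewards_out = AllRewardsOutput(reward_signals=signals, environment=np.array([1.0, 0.0]))
    # The trainer can now log the environment reward and each signal's scaled
    # reward separately instead of a single pre-mixed value.
    print(rewards_out.environment, rewards_out.reward_signals["curiosity"].scaled_reward)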
develop-gpu-test
GitHub, 5 years ago
Commit 689765d6
11 changed files with 215 additions and 118 deletions
1. ml-agents/mlagents/trainers/components/bc/module.py (2 changes)
2. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (60 changes)
3. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (9 changes)
4. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2 changes)
5. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (57 changes)
6. ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py (15 changes)
7. ml-agents/mlagents/trainers/ppo/trainer.py (22 changes)
8. ml-agents/mlagents/trainers/rl_trainer.py (53 changes)
9. ml-agents/mlagents/trainers/tests/mock_brain.py (57 changes)
10. ml-agents/mlagents/trainers/tests/test_reward_signals.py (47 changes)
11. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (9 changes)

ml-agents/mlagents/trainers/components/bc/module.py (2 changes)


         else:
             feed_dict[self.policy.model.action_masks] = np.ones(
                 (
-                    self.n_sequences,
+                    self.n_sequences * self.policy.sequence_length,
                     sum(self.policy.model.brain.vector_action_space_size),
                 )
             )
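Why the one-line fix above matters, with made-up numbers: with a recurrent policy the behavioral-cloning update batch holds n_sequences sequences of sequence_length steps each, so the dummy action mask needs one row per step, not one per sequence.

    # Hypothetical shapes only, to illustrate the fix.
    import numpy as np

    n_sequences = 4           # sequences in the update batch
    sequence_length = 16      # steps per sequence when an RNN is used
    branch_sizes = [3, 3, 2]  # discrete action branch sizes

    old_mask = np.ones((n_sequences, sum(branch_sizes)))                    # (4, 8): too few rows
    new_mask = np.ones((n_sequences * sequence_length, sum(branch_sizes)))  # (64, 8): one row per step
    print(old_mask.shape, new_mask.shape)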

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (60 changes)


 from typing import Any, Dict, List
 import numpy as np
 import tensorflow as tf
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult

         :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
         """
         if len(current_info.agents) == 0:
-            return []
+            return RewardSignalResult([], [])
+        mini_batch: Dict[str, np.array] = {}
+        # Construct the batch and use evaluate_batch
+        mini_batch["actions"] = next_info.previous_vector_actions
+        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
+        for i in range(len(current_info.visual_observations)):
+            mini_batch["visual_obs%d" % i] = current_info.visual_observations[i]
+            mini_batch["next_visual_obs%d" % i] = next_info.visual_observations[i]
+        if self.policy.use_vec_obs:
+            mini_batch["vector_obs"] = current_info.vector_observations
+            mini_batch["next_vector_in"] = next_info.vector_observations
+        result = self.evaluate_batch(mini_batch)
+        return result

+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
-        feed_dict = {
-            self.policy.model.batch_size: len(next_info.vector_observations),
-            self.policy.model.sequence_length: 1,
+        feed_dict: Dict[tf.Tensor, Any] = {
+            self.policy.model.batch_size: len(mini_batch["actions"]),
+            self.policy.model.sequence_length: self.policy.sequence_length,
         }
-        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
+        if self.policy.use_vec_obs:
+            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
+        if self.policy.model.vis_obs_size > 0:
+            for i in range(len(self.policy.model.visual_in)):
+                _obs = mini_batch["visual_obs%d" % i]
+                _next_obs = mini_batch["next_visual_obs%d" % i]
+                feed_dict[self.policy.model.visual_in[i]] = _obs
+                feed_dict[self.model.next_visual_in[i]] = _next_obs
         if self.policy.use_continuous_act:
-            feed_dict[
-                self.policy.model.selected_actions
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[
-                self.policy.model.action_holder
-            ] = next_info.previous_vector_actions
-        for i in range(self.policy.model.vis_obs_size):
-            feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
-        if self.policy.use_vec_obs:
-            feed_dict[self.model.next_vector_in] = next_info.vector_observations
+            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
         unscaled_reward = self.policy.sess.run(
             self.model.intrinsic_reward, feed_dict=feed_dict
         )

             policy_model.batch_size: num_sequences,
             policy_model.sequence_length: self.policy.sequence_length,
             policy_model.mask_input: mini_batch["masks"],
             policy_model.advantage: mini_batch["advantages"],
             policy_model.all_old_log_probs: mini_batch["action_probs"],
         }
         if self.policy.use_continuous_act:
             feed_dict[policy_model.output_pre] = mini_batch["actions_pre"]

             feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
         if policy_model.vis_obs_size > 0:
-            for i, _ in enumerate(policy_model.visual_in):
-                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
-            for i, _ in enumerate(policy_model.visual_in):
-                feed_dict[self.model.next_visual_in[i]] = mini_batch[
-                    "next_visual_obs%d" % i
-                ]
+            for i, vis_in in enumerate(policy_model.visual_in):
+                feed_dict[vis_in] = mini_batch["visual_obs%d" % i]
+            for i, next_vis_in in enumerate(self.model.next_visual_in):
+                feed_dict[next_vis_in] = mini_batch["next_visual_obs%d" % i]
         self.has_updated = True
         return feed_dict
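For reference, a rough sketch of the mini_batch dict the new evaluate_batch() above expects; the keys follow the diff, while the shapes and the commented call are only illustrative:

    import numpy as np

    batch_size, obs_size, act_size = 32, 8, 2
    mini_batch = {
        "actions": np.zeros((batch_size, act_size), dtype=np.float32),
        "done": np.zeros((batch_size, 1), dtype=np.float32),
        "vector_obs": np.zeros((batch_size, obs_size), dtype=np.float32),
        "next_vector_in": np.zeros((batch_size, obs_size), dtype=np.float32),
        # With visual observations, "visual_obs%d" / "next_visual_obs%d" hold the
        # current and next frames for camera index d.
    }
    # scaled, unscaled = curiosity_signal.evaluate_batch(mini_batch)  # assuming a constructed signal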

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (9 changes)


         param_keys = ["strength", "gamma"]
         super().check_config(config_dict, param_keys)

-    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
-        env_rews = mini_batch["environment_rewards"]
-        return RewardSignalResult(self.strength * env_rews, env_rews)

     def evaluate(
         self, current_info: BrainInfo, next_info: BrainInfo
     ) -> RewardSignalResult:

         unscaled_reward = np.array(next_info.rewards)
         scaled_reward = self.strength * unscaled_reward
         return RewardSignalResult(scaled_reward, unscaled_reward)

+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        env_rews = np.array(mini_batch["environment_rewards"])
+        return RewardSignalResult(self.strength * env_rews, env_rews)
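A toy check of the scaling performed by evaluate_batch above, assuming strength = 0.5 in the trainer config:

    import numpy as np

    strength = 0.5
    env_rews = np.array([1.0, 0.0, -1.0])      # mini_batch["environment_rewards"]
    scaled, unscaled = strength * env_rews, env_rews
    print(scaled)   # [ 0.5  0.  -0.5]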

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (2 changes)


             self.done_policy,
             reuse=True,
         )
+        self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
+        self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
         self.discriminator_score = tf.reshape(
             self.policy_estimate, [-1], name="gail_reward"
         )

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (57 changes)


         self.update_dict: Dict[str, tf.Tensor] = {
             "gail_loss": self.model.loss,
             "gail_update_batch": self.model.update_batch,
-            "gail_policy_estimate": self.model.policy_estimate,
-            "gail_expert_estimate": self.model.expert_estimate,
+            "gail_policy_estimate": self.model.mean_policy_estimate,
+            "gail_expert_estimate": self.model.mean_expert_estimate,
         }
         if self.model.use_vail:
             self.update_dict["kl_loss"] = self.model.kl_loss

             self.update_dict["beta_update"] = self.model.update_beta

-        self.stats_name_to_update_name = {"Losses/GAIL Loss": "gail_loss"}
+        self.stats_name_to_update_name = {
+            "Losses/GAIL Loss": "gail_loss",
+            "Policy/GAIL Policy Estimate": "gail_policy_estimate",
+            "Policy/GAIL Expert Estimate": "gail_expert_estimate",
+        }
-            return []
+            return RewardSignalResult([], [])
+        mini_batch: Dict[str, np.array] = {}
+        # Construct the batch
+        mini_batch["actions"] = next_info.previous_vector_actions
+        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
+        for i, obs in enumerate(current_info.visual_observations):
+            mini_batch["visual_obs%d" % i] = obs
+        if self.policy.use_vec_obs:
+            mini_batch["vector_obs"] = current_info.vector_observations
+        result = self.evaluate_batch(mini_batch)
+        return result

+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
         feed_dict = {
-            self.policy.model.batch_size: len(next_info.vector_observations),
-            self.policy.model.sequence_length: 1,
+            self.policy.model.batch_size: len(mini_batch["actions"]),
+            self.policy.model.sequence_length: self.policy.sequence_length,
         }
-        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
-        feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1])
+        if self.policy.use_vec_obs:
+            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+        if self.policy.model.vis_obs_size > 0:
+            for i in range(len(self.policy.model.visual_in)):
+                _obs = mini_batch["visual_obs%d" % i]
+                feed_dict[self.policy.model.visual_in[i]] = _obs
         if self.policy.use_continuous_act:
-            feed_dict[
-                self.policy.model.selected_actions
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[
-                self.policy.model.action_holder
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+        feed_dict[self.model.done_policy_holder] = np.array(
+            mini_batch["done"]
+        ).flatten()
         unscaled_reward = self.policy.sess.run(
             self.model.intrinsic_reward, feed_dict=feed_dict
         )

             # If num_sequences is less, we need to shorten the input batch.
             for key, element in mini_batch_policy.items():
                 mini_batch_policy[key] = element[:max_num_experiences]
-        # Get demo buffer
-        self.demonstration_buffer.update_buffer.shuffle(1)
-        # TODO: Replace with SAC sample method
-        mini_batch_demo = self.demonstration_buffer.update_buffer.make_mini_batch(
-            0, len(mini_batch_policy["actions"])
+        # Get batch from demo buffer
+        mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch(
+            len(mini_batch_policy["actions"]), 1
         )

         feed_dict: Dict[tf.Tensor, Any] = {
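The demo batch is now drawn with sample_mini_batch(batch_size, sequence_length) instead of shuffling and slicing with make_mini_batch. A rough, self-contained sketch of what random mini-batch sampling over a buffer of parallel arrays can look like (illustrative only, not the ml-agents Buffer implementation):

    # Illustrative random mini-batch sampler, not the ml-agents Buffer API.
    from typing import Dict
    import numpy as np

    def sample_mini_batch(
        buffer: Dict[str, np.ndarray], batch_size: int, sequence_length: int = 1
    ) -> Dict[str, np.ndarray]:
        """Sample batch_size experiences (in whole sequences) from parallel arrays."""
        total = len(next(iter(buffer.values())))
        num_seqs = batch_size // sequence_length
        starts = np.random.randint(0, total - sequence_length + 1, size=num_seqs)
        idx = np.concatenate([np.arange(s, s + sequence_length) for s in starts])
        return {key: np.asarray(arr)[idx] for key, arr in buffer.items()}

    demo = {"actions": np.random.rand(100, 2), "vector_obs": np.random.rand(100, 8)}
    batch = sample_mini_batch(demo, batch_size=32, sequence_length=1)
    print(batch["actions"].shape)  # (32, 2)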

ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py (15 changes)


             np.zeros(len(current_info.agents)),
         )

+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        """
+        Evaluates the reward for the data present in the Dict mini_batch. Note the distinction
+        from evaluate(), which takes in two BrainInfos. The two methods reflect the different
+        data formats (i.e. from the Buffer vs. before being placed into the Buffer). Use this
+        method when evaluating a reward function on data drawn straight from a Buffer.
+        :param mini_batch: A Dict of numpy arrays (the format used by our Buffer)
+            when drawing from the update buffer.
+        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
+        """
+        mini_batch_len = len(next(iter(mini_batch.values())))
+        return RewardSignalResult(
+            self.strength * np.zeros(mini_batch_len), np.zeros(mini_batch_len)
+        )
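A toy check of the base-class default above: the mini-batch length is taken from the first array in the dict, and the result is all zeros of that length (values here are made up):

    import numpy as np

    mini_batch = {"actions": np.zeros((5, 2)), "environment_rewards": np.zeros(5)}
    mini_batch_len = len(next(iter(mini_batch.values())))  # 5, from the "actions" array
    scaled = 1.0 * np.zeros(mini_batch_len)                # strength * zeros
    unscaled = np.zeros(mini_batch_len)
    print(mini_batch_len, scaled, unscaled)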
     def prepare_update(
         self,
         policy_model: LearningModel,

ml-agents/mlagents/trainers/ppo/trainer.py (22 changes)


 from mlagents.trainers.ppo.policy import PPOPolicy
 from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
 from mlagents.trainers.trainer import UnityTrainerException
-from mlagents.trainers.rl_trainer import RLTrainer
+from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput
 from mlagents.trainers.components.reward_signals import RewardSignalResult
 from mlagents.envs.action_info import ActionInfoOutputs

     def add_rewards_outputs(
         self,
-        value: Dict[str, Any],
-        rewards_dict: Dict[str, RewardSignalResult],
+        rewards_out: AllRewardsOutput,
+        values: Dict[str, np.ndarray],
         agent_id: str,
         agent_idx: int,
         agent_next_idx: int,

"""
for name, reward_result in rewards_dict.items():
for name, reward_result in rewards_out.reward_signals.items():
value[name][agent_next_idx][0]
values[name][agent_next_idx][0]
def end_episode(self):
"""
A signal that the Episode has ended. The buffer must be reset.
Get only called when the academy resets.
"""
self.training_buffer.reset_local_buffers()
for agent_id in self.episode_steps:
self.episode_steps[agent_id] = 0
for rewards in self.collected_rewards.values():
for agent_id in rewards:
rewards[agent_id] = 0
def is_ready_update(self):
"""

ml-agents/mlagents/trainers/rl_trainer.py (53 changes)


 # # Unity ML-Agents Toolkit
 import logging
-from typing import Dict, List, Deque, Any
+from typing import Dict, List, Deque, Any, Optional, NamedTuple
 import os
 import tensorflow as tf
 import numpy as np

 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.tf_policy import Policy
 from mlagents.trainers.trainer import Trainer, UnityTrainerException
+from mlagents.trainers.components.reward_signals.reward_signal import RewardSignalResult
+
+RewardSignalResults = Dict[str, RewardSignalResult]
+
+
+class AllRewardsOutput(NamedTuple):
+    """
+    This class stores all of the outputs of the reward signals,
+    as well as the raw reward from the environment.
+    """
+
+    reward_signals: RewardSignalResults
+    environment: np.ndarray
+
+
 class RLTrainer(Trainer):

         else:
             curr_to_use = curr_info

-        tmp_rewards_dict = {}
+        # Evaluate and store the reward signals
+        tmp_reward_signal_outs = {}
         for name, signal in self.policy.reward_signals.items():
-            tmp_rewards_dict[name] = signal.evaluate(curr_to_use, next_info)
+            tmp_reward_signal_outs[name] = signal.evaluate(curr_to_use, next_info)
+        # Store the environment reward
+        tmp_environment = np.array(next_info.rewards)
+
+        rewards_out = AllRewardsOutput(
+            reward_signals=tmp_reward_signal_outs, environment=tmp_environment
+        )
+
         for agent_id in next_info.agents:
             stored_info = self.training_buffer[agent_id].last_brain_info

                 )
                 values = stored_take_action_outputs["value_heads"]
                 self.add_rewards_outputs(
-                    values, tmp_rewards_dict, agent_id, idx, next_idx
+                    rewards_out, values, agent_id, idx, next_idx
                 )
                 for name, rewards in self.collected_rewards.items():
                     if name == "environment":
                         # Report the reward from the environment
-                        rewards[agent_id] += np.array(next_info.rewards)[next_idx]
+                        rewards[agent_id] += rewards_out.environment[next_idx]
                     else:
-                        rewards[agent_id] += tmp_rewards_dict[name].scaled_reward[
-                            next_idx
-                        ]
+                        rewards[agent_id] += rewards_out.reward_signals[
+                            name
+                        ].scaled_reward[next_idx]
             if not next_info.local_done[next_idx]:
                 if agent_id not in self.episode_steps:
                     self.episode_steps[agent_id] = 0

+    def end_episode(self) -> None:
+        """
+        A signal that the Episode has ended. The buffer must be reset.
+        Gets called only when the academy resets.
+        """
+        self.training_buffer.reset_local_buffers()
+        for agent_id in self.episode_steps:
+            self.episode_steps[agent_id] = 0
+        for rewards in self.collected_rewards.values():
+            for agent_id in rewards:
+                rewards[agent_id] = 0
+
     def add_policy_outputs(
         self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
     ) -> None:

     def add_rewards_outputs(
         self,
-        value: Dict[str, Any],
-        rewards_dict: Dict[str, float],
+        rewards_out: AllRewardsOutput,
+        values: Dict[str, np.ndarray],
         agent_id: str,
         agent_idx: int,
         agent_next_idx: int,

ml-agents/mlagents/trainers/tests/mock_brain.py (57 changes)


     return buffer


-def create_buffer(brain_infos, brain_params, sequence_length):
+def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
     buffer = Buffer()
     # Make a buffer
     for idx, experience in enumerate(brain_infos):

         buffer[0]["random_normal_epsilon"].append(
             np.ones(buffer[0]["actions"][0].shape)
         )
-        buffer[0]["action_mask"].append(np.ones(buffer[0]["actions"][0].shape))
-        buffer[0]["memory"].append(np.ones(8))
+        buffer[0]["action_mask"].append(
+            np.ones(np.sum(brain_params.vector_action_space_size))
+        )
+        buffer[0]["memory"].append(np.ones(memory_size))
+def setup_mock_env_and_brains(
+    mock_env,
+    use_discrete,
+    use_visual,
+    num_agents=12,
+    discrete_action_space=[3, 3, 3, 2],
+    vector_action_space=[2],
+    vector_obs_space=8,
+):
+    if not use_visual:
+        mock_brain = create_mock_brainparams(
+            vector_action_space_type="discrete" if use_discrete else "continuous",
+            vector_action_space_size=discrete_action_space
+            if use_discrete
+            else vector_action_space,
+            vector_observation_space_size=vector_obs_space,
+        )
+        mock_braininfo = create_mock_braininfo(
+            num_agents=num_agents,
+            num_vector_observations=vector_obs_space,
+            num_vector_acts=sum(
+                discrete_action_space if use_discrete else vector_action_space
+            ),
+            discrete=use_discrete,
+            num_discrete_branches=len(discrete_action_space),
+        )
+    else:
+        mock_brain = create_mock_brainparams(
+            vector_action_space_type="discrete" if use_discrete else "continuous",
+            vector_action_space_size=discrete_action_space
+            if use_discrete
+            else vector_action_space,
+            vector_observation_space_size=0,
+            number_visual_observations=1,
+        )
+        mock_braininfo = create_mock_braininfo(
+            num_agents=num_agents,
+            num_vis_observations=1,
+            num_vector_acts=sum(
+                discrete_action_space if use_discrete else vector_action_space
+            ),
+            discrete=use_discrete,
+            num_discrete_branches=len(discrete_action_space),
+        )
+    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
+    env = mock_env()
+    return env, mock_brain, mock_braininfo
+
+
 def create_mock_3dball_brain():

ml-agents/mlagents/trainers/tests/test_reward_signals.py (47 changes)


 def create_ppo_policy_mock(
     mock_env, dummy_config, reward_signal_config, use_rnn, use_discrete, use_visual
 ):
-    if not use_visual:
-        mock_brain = mb.create_mock_brainparams(
-            vector_action_space_type="discrete" if use_discrete else "continuous",
-            vector_action_space_size=DISCRETE_ACTION_SPACE
-            if use_discrete
-            else VECTOR_ACTION_SPACE,
-            vector_observation_space_size=VECTOR_OBS_SPACE,
-        )
-        mock_braininfo = mb.create_mock_braininfo(
-            num_agents=NUM_AGENTS,
-            num_vector_observations=VECTOR_OBS_SPACE,
-            num_vector_acts=sum(
-                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
-            ),
-            discrete=use_discrete,
-            num_discrete_branches=len(DISCRETE_ACTION_SPACE),
-        )
-    else:
-        mock_brain = mb.create_mock_brainparams(
-            vector_action_space_type="discrete" if use_discrete else "continuous",
-            vector_action_space_size=DISCRETE_ACTION_SPACE
-            if use_discrete
-            else VECTOR_ACTION_SPACE,
-            vector_observation_space_size=0,
-            number_visual_observations=1,
-        )
-        mock_braininfo = mb.create_mock_braininfo(
-            num_agents=NUM_AGENTS,
-            num_vis_observations=1,
-            num_vector_acts=sum(
-                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
-            ),
-            discrete=use_discrete,
-            num_discrete_branches=len(DISCRETE_ACTION_SPACE),
-        )
-    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
-    env = mock_env()
+    env, mock_brain, _ = mb.setup_mock_env_and_brains(
+        mock_env,
+        use_discrete,
+        use_visual,
+        num_agents=NUM_AGENTS,
+        vector_action_space=VECTOR_ACTION_SPACE,
+        vector_obs_space=VECTOR_OBS_SPACE,
+        discrete_action_space=DISCRETE_ACTION_SPACE,
+    )
     trainer_parameters = dummy_config
     model_path = env.brain_names[0]

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (9 changes)


     # assert construct_curr_info worked properly
     assert len(brain_info.agents) == 1

+    # Test end episode
+    trainer.end_episode()
+    for agent_id in trainer.episode_steps:
+        assert trainer.episode_steps[agent_id] == 0
+        assert len(trainer.training_buffer[agent_id]["action"]) == 0
+    for rewards in trainer.collected_rewards.values():
+        for agent_id in rewards:
+            assert rewards[agent_id] == 0