
Change reward signal creation

/develop/nopreviousactions
Ervin Teng, 5 years ago
Commit abc98c23
5 files changed, 21 insertions(+), 33 deletions(-)
  1. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (9 changes)
  2. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (23 changes)
  3. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (16 changes)
  4. ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py (4 changes)
  5. ml-agents/mlagents/trainers/optimizer.py (2 changes)

ml-agents/mlagents/trainers/components/reward_signals/__init__.py (9 changes)


 class RewardSignal(abc.ABC):
-    def __init__(
-        self,
-        policy: TFPolicy,
-        policy_model: LearningModel,
-        strength: float,
-        gamma: float,
-    ):
+    def __init__(self, policy: TFPolicy, strength: float, gamma: float):
         """
         Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
         the reward strength, and the gamma (discount factor.)

         self.update_dict: Dict[str, tf.Tensor] = {}
         self.gamma = gamma
         self.policy = policy
-        self.policy_model = policy_model
         self.strength = strength
         self.stats_name_to_update_name: Dict[str, str] = {}
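Note: with this change a reward signal is constructed from the policy alone; the LearningModel is no longer threaded through separately. Below is a minimal sketch of a subclass written against the new constructor (illustrative only, not part of this commit; the evaluate_batch hook and the RewardSignalResult fields are assumed from the surrounding base class):

import numpy as np

from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult


class ConstantRewardSignal(RewardSignal):  # hypothetical example, not in the repo
    def __init__(self, policy: TFPolicy, strength: float, gamma: float):
        # policy_model is gone; only policy, strength, and gamma are passed up.
        super().__init__(policy, strength, gamma)

    def evaluate_batch(self, mini_batch) -> RewardSignalResult:
        # Emit a fixed unscaled reward per step, scaled by the configured strength.
        unscaled = np.ones(len(mini_batch["actions"]), dtype=np.float32)
        return RewardSignalResult(self.strength * unscaled, unscaled)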

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (23 changes)


     def __init__(
         self,
         policy: TFPolicy,
-        policy_model: LearningModel,
         strength: float,
         gamma: float,
         encoding_size: int = 128,

         :param encoding_size: The size of the hidden encoding layer for the ICM
         :param learning_rate: The learning rate for the ICM.
         """
-        super().__init__(policy, policy_model, strength, gamma)
+        super().__init__(policy, strength, gamma)
-            policy_model, encoding_size=encoding_size, learning_rate=learning_rate
+            policy, encoding_size=encoding_size, learning_rate=learning_rate
         )
         self.use_terminal_states = False
         self.update_dict = {

     def prepare_update(
         self,
-        policy_model: LearningModel,
+        policy: TFPolicy,
         mini_batch: Dict[str, np.ndarray],
         num_sequences: int,
     ) -> Dict[tf.Tensor, Any]:

         :return: Feed_dict needed for update.
         """
         feed_dict = {
-            policy_model.batch_size: num_sequences,
-            policy_model.sequence_length: self.policy.sequence_length,
-            policy_model.mask_input: mini_batch["masks"],
+            policy.batch_size: num_sequences,
+            policy.sequence_length: self.policy.sequence_length,
+            policy.mask_input: mini_batch["masks"],
-            feed_dict[policy_model.selected_actions] = mini_batch["actions"]
+            feed_dict[policy.selected_actions] = mini_batch["actions"]
-            feed_dict[policy_model.action_holder] = mini_batch["actions"]
+            feed_dict[policy.action_holder] = mini_batch["actions"]
-            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[policy.vector_in] = mini_batch["vector_obs"]
-        if policy_model.vis_obs_size > 0:
-            for i, vis_in in enumerate(policy_model.visual_in):
+        if policy.vis_obs_size > 0:
+            for i, vis_in in enumerate(policy.visual_in):
                 feed_dict[vis_in] = mini_batch["visual_obs%d" % i]
             for i, next_vis_in in enumerate(self.model.next_visual_in):
                 feed_dict[next_vis_in] = mini_batch["next_visual_obs%d" % i]
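With prepare_update taking the policy directly, the call site just hands over the policy that already owns the TensorFlow graph. A hedged sketch of how an optimizer might drive the update (curiosity_signal, mini_batch, num_sequences, and the use of policy.sess are assumptions for illustration, not taken from this diff):

# Build the feed dict from the policy's own placeholders, then run the signal's update ops.
feed_dict = curiosity_signal.prepare_update(policy, mini_batch, num_sequences)
update_stats = policy.sess.run(curiosity_signal.update_dict, feed_dict=feed_dict)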

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (16 changes)


 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.models import LearningModel
 from .model import GAILModel
 from mlagents.trainers.demo_loader import demo_to_buffer

     def __init__(
         self,
         policy: TFPolicy,
-        policy_model: LearningModel,
         strength: float,
         gamma: float,
         demo_path: str,

         :param use_vail: Whether or not to use a variational bottleneck for the discriminator.
             See https://arxiv.org/abs/1810.00821.
         """
-        super().__init__(policy, policy_model, strength, gamma)
+        super().__init__(policy, strength, gamma)
         self.use_terminal_states = False
         self.model = GAILModel(

     def prepare_update(
         self,
-        policy_model: LearningModel,
+        policy: TFPolicy,
         mini_batch: Dict[str, np.ndarray],
         num_sequences: int,
     ) -> Dict[tf.Tensor, Any]:

         feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
         if self.policy.use_continuous_act:
-            feed_dict[policy_model.selected_actions] = mini_batch["actions"]
+            feed_dict[policy.selected_actions] = mini_batch["actions"]
-            feed_dict[policy_model.action_holder] = mini_batch["actions"]
+            feed_dict[policy.action_holder] = mini_batch["actions"]
-            for i in range(len(policy_model.visual_in)):
-                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
+            for i in range(len(policy.visual_in)):
+                feed_dict[policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
-            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[policy.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
         self.has_updated = True
         return feed_dict

ml-agents/mlagents/trainers/components/reward_signals/reward_signal_factory.py (4 changes)


     CuriosityRewardSignal,
 )
 from mlagents.trainers.tf_policy import TFPolicy
-from mlagents.trainers.models import LearningModel
 logger = logging.getLogger("mlagents.trainers")

 def create_reward_signal(
     policy: TFPolicy,
-    policy_model: LearningModel,
     name: str,
     config_entry: Dict[str, Any],
 ) -> RewardSignal:

         raise UnityTrainerException("Unknown reward signal type {0}".format(name))
     rcls.check_config(config_entry)
     try:
-        class_inst = rcls(policy, policy_model, **config_entry)
+        class_inst = rcls(policy, **config_entry)
     except TypeError:
         raise UnityTrainerException(
             "Unknown parameters given for reward signal {0}".format(name)

ml-agents/mlagents/trainers/optimizer.py (2 changes)


         # Create reward signals
         for reward_signal, config in reward_signal_configs.items():
             self.reward_signals[reward_signal] = create_reward_signal(
-                self, self.policy, reward_signal, config
+                self.policy, reward_signal, config
             )
             self.update_dict.update(self.reward_signals[reward_signal].update_dict)
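For context, the reward_signal_configs iterated above maps signal names to their settings, mirroring the trainer config keys; the values below are illustrative only, not taken from this diff:

# Example shape only; every value here is an assumption.
reward_signal_configs = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
    "curiosity": {"strength": 0.02, "gamma": 0.99, "encoding_size": 128},
    "gail": {"strength": 0.01, "gamma": 0.99, "demo_path": "demos/Expert.demo"},
}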
