import logging

import numpy as np
from typing import Any, Dict, List, Optional

from mlagents.tf_utils import tf

from mlagents.trainers.trajectory import split_obs
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)

            ]
            feed_dict[model.memory_in] = mem_in
        return feed_dict

    def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
        """
        Generates value estimates for a whole batch of experiences.
        :param batch: AgentBuffer of experiences to evaluate.
        :return: A dictionary mapping each reward signal name to an array of
            value estimates, one per experience in the batch.
        """
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: batch.num_experiences,
            self.model.sequence_length: 1,  # We want to feed data batch-wise, not time-wise.
        }
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = batch["vector_obs"]
        if self.model.vis_obs_size > 0:
            for i in range(len(self.model.visual_in)):
                _obs = batch["visual_obs%d" % i]
                feed_dict[self.model.visual_in[i]] = _obs
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = batch["memory"]
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = batch["prev_action"]
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
        # Each value head outputs shape (batch_size, 1); squeeze to (batch_size,).
        value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}

        return value_estimates
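
    # A sketch of the expected output, assuming illustrative reward signal names
    # ("extrinsic", "curiosity") that are not defined in this module: for a batch
    # of 128 experiences, the returned dict looks like
    # {"extrinsic": array of shape (128,), "curiosity": array of shape (128,)}.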

    def get_value_estimates(
        self, next_obs: List[np.ndarray], agent_id: str, done: bool
    ) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param next_obs: List of next observations to be used for bootstrapping.
        :param agent_id: ID of the agent whose stored memories and previous
            actions should be used.
        :param done: Whether or not this is the last element of the episode, in
            which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the
            reward signal and the value the corresponding value estimate.
        """
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: 1,
            self.model.sequence_length: 1,
        }
        vec_vis_obs = split_obs(next_obs)
        for i in range(len(vec_vis_obs.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]

        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                [agent_id]
            )
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)

        value_estimates = {k: float(v) for k, v in value_estimates.items()}

        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0

        return value_estimates
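
    # A minimal usage sketch from a hypothetical trainer update step (`policy`
    # and `trajectory` are illustrative names, not part of this module):
    #
    #     batch = trajectory.to_agentbuffer()
    #     batch_values = policy.get_batched_value_estimates(batch)
    #     bootstrap_values = policy.get_value_estimates(
    #         trajectory.next_obs, trajectory.agent_id, trajectory.done_reached
    #     )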