from typing import Any, Dict, List

import numpy as np

from mlagents.tf_utils import tf  # assumed: the TF compatibility wrapper used elsewhere in mlagents.trainers
from tensorflow.python.framework import graph_util

from mlagents.trainers import tensorflow_to_barracuda as tf2bc
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.env_manager import get_global_agent_id
from mlagents_envs.base_env import BatchedStepResult


        # Tail of update_normalization: run the graph op that refreshes the running
        # mean/variance statistics used to normalize vector observations.
        self.sess.run(
            self.model.update_normalization,
            feed_dict={self.model.vector_in: vector_obs},
        )
    def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
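        """
        Generates value estimates for a whole batch of experiences in a single pass.
        :param batch: AgentBuffer of experiences to evaluate.
        :return: Dictionary with the name of each reward signal as key and an array of
        value estimates (one per experience in the batch) as value.
        """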
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: batch.num_experiences,
            self.model.sequence_length: 1,  # We want to feed data batch-wise, not time-wise.
        }

        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = batch["vector_obs"]
        if self.model.vis_obs_size > 0:
            for i in range(len(self.model.visual_in)):
                _obs = batch["visual_obs%d" % i]
                feed_dict[self.model.visual_in[i]] = _obs
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = batch["memory"]
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = batch["prev_action"]
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
        # Each value head outputs shape (batch_size, 1); squeeze to (batch_size,).
        value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}

        return value_estimates
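
    # Usage sketch (illustrative, not from this file): a trainer computing returns or
    # GAE for a completed trajectory would typically evaluate the whole trajectory at
    # once, e.g.
    #     value_estimates = policy.get_batched_value_estimates(
    #         trajectory.to_agentbuffer()
    #     )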
    def get_value_estimates(
        self, next_obs: List[np.ndarray], agent_id: str, done: bool
    ) -> Dict[str, float]:
""" |
|
|
|
Generates value estimates for bootstrapping. |
|
|
|
:param experience: AgentExperience to be used for bootstrapping. |
|
|
|
:param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0. |
|
|
|
:return: The value estimate dictionary with key being the name of the reward signal and the value the |
|
|
|
corresponding value estimate. |
|
|
|
""" |
|
|
|
|
|
|
|
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: 1,
            self.model.sequence_length: 1,
        }
        vec_vis_obs = SplitObservations.from_observations(next_obs)
        for i in range(len(vec_vis_obs.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]

        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                [agent_id]
            )
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)

        value_estimates = {k: float(v) for k, v in value_estimates.items()}

        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0

        return value_estimates
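
    # Usage sketch (illustrative, not from this file): when an episode segment ends,
    # the trainer bootstraps the last return from the value of the next observation, e.g.
    #     value_next = policy.get_value_estimates(
    #         trajectory.next_obs, agent_id, trajectory.done_reached
    #     )
    # where `trajectory` and its attributes are assumed names.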
    @property
    def vis_obs_size(self):