def get_trajectory_value_estimates(
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
n_obs = len(self.policy.behavior_spec.observation_specs)
current_obs = ObsUtil.from_buffer(batch, n_obs)
# Convert to tensors
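# Hedged sketch (assumption, not the repo's implementation): a simplified view
# of what ObsUtil.from_buffer does above. The AgentBuffer is keyed by strings,
# so one array per observation sensor is pulled out, which is why n_obs must
# equal len(behavior_spec.observation_specs). The "obs_{i}" key scheme and the
# helper name below are illustrative assumptions only.
import numpy as np

def from_buffer_sketch(batch, n_obs):
    # one np.ndarray per observation sensor, in spec order
    return [np.asarray(batch[f"obs_{i}"]) for i in range(n_obs)]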
else:
ac_class = SharedActorCritic
self.actor_critic = ac_class(
observation_spec=self.behavior_spec.observation_specs,
network_settings=trainer_settings.network_settings,
action_spec=behavior_spec.action_spec,
stream_names=reward_signal_names,
for name in self.reward_signals:
rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
) -> AgentBuffer:
trajectory = make_fake_trajectory(
length,
behavior_spec.observation_specs,
memory_size=memory_size,
)
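# Hedged continuation sketch (assumption): helpers returning an AgentBuffer
# typically flatten the fake trajectory before handing it back, matching the
# "-> AgentBuffer" annotation above. to_agentbuffer() is assumed available.
return trajectory.to_agentbuffer()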
time_horizon = 15
trajectory = make_fake_trajectory(
    length=time_horizon,
    observation_spec=optimizer.policy.behavior_spec.observation_specs,
    action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
    max_step_complete=True,
)
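# Hedged continuation sketch (assumption, mirroring get_trajectory_value_estimates
# shown earlier): the test would flatten the fake trajectory, request value
# estimates, and expect one array of length time_horizon per reward signal.
value_estimates, value_next = optimizer.get_trajectory_value_estimates(
    trajectory.to_agentbuffer(), trajectory.next_obs, done=False
)
for name, arr in value_estimates.items():
    assert len(arr) == time_horizon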
Helper function for update_batch.
"""
np_obs = ObsUtil.from_buffer(
    mini_batch_demo, len(self.policy.behavior_spec.observation_specs)
)
tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
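# Self-contained illustration (not from the repo): list_to_tensor is in essence
# a NumPy-to-torch wrapper, so each per-sensor demo observation array becomes a
# float tensor the network can consume. Shapes below are made up.
import numpy as np
import torch

demo_obs = np.zeros((32, 8), dtype=np.float32)  # e.g. 32 demo steps, 8-dim vector obs
demo_obs_t = torch.as_tensor(demo_obs)          # roughly what list_to_tensor produces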
dummy_vec_obs = [torch.zeros(batch_dim + [vec_obs_size])]
# create input shape of NCHW
# (It's NHWC in self.policy.behavior_spec.observation_specs.shape)
dummy_vis_obs = [
    torch.zeros(batch_dim + [obs_spec.shape[2], obs_spec.shape[0], obs_spec.shape[1]])
    for obs_spec in self.policy.behavior_spec.observation_specs
if len(obs_spec.shape) == 3
]
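# Self-contained illustration (not from the repo) of the NHWC -> NCHW point in
# the comment above: a visual spec stores (H, W, C), while the exported model
# takes (N, C, H, W), hence the [2, 0, 1] reordering of obs_spec.shape.
import torch

nhwc = torch.zeros(1, 84, 84, 3)   # N, H, W, C as stored in the obs spec shape
nchw = nhwc.permute(0, 3, 1, 2)    # N, C, H, W as the exported graph expects
assert list(nchw.shape) == [1, 3, 84, 84]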
dummy_masks = torch.ones(