
Properly report value estimates and episode length

/develop-newnormalization
Ervin Teng · 5 years ago
Current commit 2b811fc8
5 files changed, 101 insertions and 107 deletions
  1. ml-agents/mlagents/trainers/ppo/policy.py (64 changes)
  2. ml-agents/mlagents/trainers/ppo/trainer.py (26 changes)
  3. ml-agents/mlagents/trainers/rl_trainer.py (23 changes)
  4. ml-agents/mlagents/trainers/sac/trainer.py (33 changes)
  5. ml-agents/mlagents/trainers/tf_policy.py (62 changes)

ml-agents/mlagents/trainers/ppo/policy.py (64 changes)


import logging
import numpy as np
from typing import Any, Dict, Optional, List
from typing import Any, Dict, Optional
from mlagents.tf_utils import tf

from mlagents.trainers.trajectory import split_obs
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)

            ]
            feed_dict[model.memory_in] = mem_in
        return feed_dict

    def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: batch.num_experiences,
            self.model.sequence_length: 1,  # We want to feed data in batch-wise, not time-wise.
        }
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = batch["vector_obs"]
        if self.model.vis_obs_size > 0:
            for i in range(len(self.model.visual_in)):
                _obs = batch["visual_obs%d" % i]
                feed_dict[self.model.visual_in[i]] = _obs
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = batch["memory"]
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = batch["prev_action"]
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
        value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}
        return value_estimates

    def get_value_estimates(
        self, next_obs: List[np.ndarray], agent_id: str, done: bool
    ) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param next_obs: The observations that follow the last step of the trajectory, used for bootstrapping.
        :param agent_id: The ID of the agent, used to look up stored memories and previous actions when the policy is recurrent.
        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal and the value the
        corresponding value estimate.
        """
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: 1,
            self.model.sequence_length: 1,
        }
        vec_vis_obs = split_obs(next_obs)
        for i in range(len(vec_vis_obs.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                [agent_id]
            )
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
        value_estimates = {k: float(v) for k, v in value_estimates.items()}
        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0
        return value_estimates
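For context, this is roughly how a trainer consumes these two methods when it processes a finished trajectory. The snippet below is an illustrative sketch rather than part of this commit; `policy`, `trajectory`, and `agent_buffer_trajectory` stand in for the trainer's own attributes, and the `done` argument follows the usual convention of treating an episode cut off by the step limit as non-terminal.

    # Sketch only: per-step value estimates for reporting, plus a bootstrap value
    # taken from the observation that follows the last step of the trajectory.
    value_estimates = policy.get_batched_value_estimates(agent_buffer_trajectory)
    for name, v in value_estimates.items():
        agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)

    value_next = policy.get_value_estimates(
        trajectory.next_obs,
        trajectory.agent_id,
        trajectory.steps[-1].done and not trajectory.steps[-1].max_step,
    )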

ml-agents/mlagents/trainers/ppo/trainer.py (26 changes)


"""
agent_id = trajectory.agent_id # All the agents should have the same ID
# Note that this agent buffer version of the traj. is one less than the len of the raw trajectory
# for bootstrapping purposes.
# Add to episode_steps
self.episode_steps[agent_id] += len(trajectory.steps)
agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
# Update the normalization
if self.is_training:

        )
        for name, v in value_estimates.items():
            agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
            self.stats[self.policy.reward_signals[name].value_name].append(np.mean(v))
        value_next = self.policy.get_value_estimates(
            trajectory.next_obs,

        # If this was a terminal trajectory, append stats and reset reward collection
        if trajectory.steps[-1].done:
            self.stats["Environment/Episode Length"].append(
                self.episode_steps.get(agent_id, 0)
            )
            self.episode_steps[agent_id] = 0
            for name, rewards in self.collected_rewards.items():
                if name == "environment":
                    self.cumulative_returns_since_policy_update.append(
                        rewards.get(agent_id, 0)
                    )
                    self.stats["Environment/Cumulative Reward"].append(
                        rewards.get(agent_id, 0)
                    )
                    self.reward_buffer.appendleft(rewards.get(agent_id, 0))
                    rewards[agent_id] = 0
                else:
                    self.stats[self.policy.reward_signals[name].stat_name].append(
                        rewards.get(agent_id, 0)
                    )
                    rewards[agent_id] = 0
            self._update_end_episode_stats(agent_id)

    def is_ready_update(self):
        """

ml-agents/mlagents/trainers/rl_trainer.py (23 changes)


        # of what reward signals are actually present.
        self.collected_rewards = {"environment": defaultdict(lambda: 0)}
        self.update_buffer = AgentBuffer()
        self.episode_steps = {}
        self.episode_steps = defaultdict(lambda: 0)

    def end_episode(self) -> None:
        """

            self.episode_steps[agent_id] = 0
        for rewards in self.collected_rewards.values():
            for agent_id in rewards:
                rewards[agent_id] = 0

    def _update_end_episode_stats(self, agent_id: str) -> None:
        self.stats["Environment/Episode Length"].append(
            self.episode_steps.get(agent_id, 0)
        )
        self.episode_steps[agent_id] = 0
        for name, rewards in self.collected_rewards.items():
            if name == "environment":
                self.cumulative_returns_since_policy_update.append(
                    rewards.get(agent_id, 0)
                )
                self.stats["Environment/Cumulative Reward"].append(
                    rewards.get(agent_id, 0)
                )
                self.reward_buffer.appendleft(rewards.get(agent_id, 0))
                rewards[agent_id] = 0
            else:
                self.stats[self.policy.reward_signals[name].stat_name].append(
                    rewards.get(agent_id, 0)
                )
                rewards[agent_id] = 0

    def clear_update_buffer(self) -> None:
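One note on the change from `self.episode_steps = {}` to `defaultdict(lambda: 0)` above: the subclass trainers increment `episode_steps[agent_id]` without first checking whether the key exists, which only works if missing agent IDs default to zero. A small standalone illustration:

    from collections import defaultdict

    episode_steps = defaultdict(lambda: 0)
    # An unseen agent ID silently starts at 0 instead of raising KeyError,
    # so trainers can accumulate step counts in a single line.
    episode_steps["agent-0"] += 7
    episode_steps["agent-0"] += 3
    assert episode_steps["agent-0"] == 10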

ml-agents/mlagents/trainers/sac/trainer.py (33 changes)


        for _reward_signal in self.policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
        self.episode_steps = {}

    def save_model(self) -> None:
        """
        Saves the model. Overrides the default save_model since we want to save

        last_step = trajectory.steps[-1]
        agent_id = trajectory.agent_id  # All the agents should have the same ID
        # Add to episode_steps
        self.episode_steps[agent_id] += len(trajectory.steps)
        agent_buffer_trajectory = trajectory_to_agentbuffer(trajectory)
        # Update the normalization

            evaluate_result = reward_signal.evaluate_batch(
                agent_buffer_trajectory
            ).scaled_reward
            agent_buffer_trajectory["{}_rewards".format(name)].extend(evaluate_result)
        # Get all value estimates for reporting purposes
        value_estimates = self.policy.get_batched_value_estimates(
            agent_buffer_trajectory
        )
        for name, v in value_estimates.items():
            self.stats[self.policy.reward_signals[name].value_name].append(np.mean(v))
        # Bootstrap using the last step rather than the bootstrap step if max step is reached.
        # Set last element to duplicate obs and remove dones.
        if last_step.max_step:

        )
        if trajectory.steps[-1].done:
            self.stats["Environment/Episode Length"].append(
                self.episode_steps.get(agent_id, 0)
            )
            self.episode_steps[agent_id] = 0
            for name, rewards in self.collected_rewards.items():
                if name == "environment":
                    self.cumulative_returns_since_policy_update.append(
                        rewards.get(agent_id, 0)
                    )
                    self.stats["Environment/Cumulative Reward"].append(
                        rewards.get(agent_id, 0)
                    )
                    self.reward_buffer.appendleft(rewards.get(agent_id, 0))
                    rewards[agent_id] = 0
                else:
                    self.stats[self.policy.reward_signals[name].stat_name].append(
                        rewards.get(agent_id, 0)
                    )
                    rewards[agent_id] = 0
            self._update_end_episode_stats(agent_id)

    def is_ready_update(self) -> bool:
        """

ml-agents/mlagents/trainers/tf_policy.py (62 changes)


from tensorflow.python.platform import gfile
from tensorflow.python.framework import graph_util
from mlagents.trainers import tensorflow_to_barracuda as tf2bc
from mlagents.trainers.trajectory import split_obs
from mlagents.trainers.buffer import AgentBuffer
from mlagents.envs.brain import BrainInfo

                self.model.update_normalization,
                feed_dict={self.model.vector_in: vector_obs},
            )

    def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: batch.num_experiences,
            self.model.sequence_length: 1,  # We want to feed data in batch-wise, not time-wise.
        }
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = batch["vector_obs"]
        if self.model.vis_obs_size > 0:
            for i in range(len(self.model.visual_in)):
                _obs = batch["visual_obs%d" % i]
                feed_dict[self.model.visual_in[i]] = _obs
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = batch["memory"]
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = batch["prev_action"]
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
        value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}
        return value_estimates

    def get_value_estimates(
        self, next_obs: List[np.ndarray], agent_id: str, done: bool
    ) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param next_obs: The observations that follow the last step of the trajectory, used for bootstrapping.
        :param agent_id: The ID of the agent, used to look up stored memories and previous actions when the policy is recurrent.
        :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal and the value the
        corresponding value estimate.
        """
        feed_dict: Dict[tf.Tensor, Any] = {
            self.model.batch_size: 1,
            self.model.sequence_length: 1,
        }
        vec_vis_obs = split_obs(next_obs)
        for i in range(len(vec_vis_obs.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = self.retrieve_previous_action(
                [agent_id]
            )
        value_estimates = self.sess.run(self.model.value_heads, feed_dict)
        value_estimates = {k: float(v) for k, v in value_estimates.items()}
        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0
        return value_estimates

    @property
    def vis_obs_size(self):
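Both value-estimate helpers rely on `split_obs` from `mlagents.trainers.trajectory` to split an agent's observation list into visual and vector parts before building the feed dict. Below is a rough sketch of what such a helper looks like, under the assumption that rank-3 observations are visual and the remaining 1-D observations are concatenated into a single vector; the field names mirror how the result is used above, but the implementation itself is illustrative.

    from typing import List, NamedTuple
    import numpy as np

    class SplitObservations(NamedTuple):
        vector_observations: np.ndarray
        visual_observations: List[np.ndarray]

    def split_obs(obs: List[np.ndarray]) -> SplitObservations:
        # Visual observations are assumed to be H x W x C arrays; everything
        # one-dimensional is concatenated into a single vector observation.
        visual = [o for o in obs if o.ndim == 3]
        vector_parts = [o for o in obs if o.ndim == 1]
        vector = (
            np.concatenate(vector_parts) if vector_parts else np.zeros(0, dtype=np.float32)
        )
        return SplitObservations(vector_observations=vector, visual_observations=visual)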
