
More progress

Branch: develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit: cd74e51b
7 files changed, with 179 additions and 85 deletions
  1. ml-agents/mlagents/trainers/models.py (1 change)
  2. ml-agents/mlagents/trainers/ppo/models.py (45 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer.py (110 changes)
  4. ml-agents/mlagents/trainers/ppo/policy.py (58 changes)
  5. ml-agents/mlagents/trainers/ppo/trainer.py (14 changes)
  6. ml-agents/mlagents/trainers/rl_trainer.py (2 changes)
  7. ml-agents/mlagents/trainers/optimizer.py (34 changes)

ml-agents/mlagents/trainers/models.py (1 change)


self.output: Optional[tf.Tensor] = None
self.selected_actions: Optional[tf.Tensor] = None
self.action_holder: Optional[tf.Tensor] = None
self.prev_action: Optional[tf.Tensor] = None
self.global_step, self.increment_step, self.steps_to_increment = (
self.create_global_steps()
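For context, the tuple unpacked from create_global_steps() above typically pairs a non-trainable step counter with an op that increments it. A minimal sketch, assuming the TF1 graph API used throughout this diff (the exact implementation is not shown in this commit):

from mlagents.tf_utils import tf

def create_global_steps():
    # Non-trainable step counter, an increment op, and the placeholder
    # that feeds how many steps to add.
    global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
    steps_to_increment = tf.placeholder(shape=[], dtype=tf.int32, name="steps_to_increment")
    increment_step = tf.assign(global_step, global_step + steps_to_increment)
    return global_step, increment_step, steps_to_increment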

ml-agents/mlagents/trainers/ppo/models.py (45 changes)


if num_layers < 1:
num_layers = 1
if brain.vector_action_space_type == "continuous":
self.create_cc_actor_critic(h_size, num_layers, vis_encode_type)
self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
self.create_cc_actor(h_size, num_layers, vis_encode_type)
self.entropy = tf.ones_like(tf.reshape(self.entropy, [-1])) * self.entropy
self.learning_rate = self.create_learning_rate(
lr_schedule, lr, self.global_step, max_step
)
self.create_losses(
self.log_probs,
self.old_log_probs,
self.value_heads,
self.entropy,
beta,
epsilon,
lr,
max_step,
)
def create_cc_actor_critic(
def create_cc_actor(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""

"""
hidden_streams = LearningModel.create_observation_streams(
hidden_stream = LearningModel.create_observation_streams(
2,
1,
)
stream_scopes=["policy"],
)[0]
if self.use_recurrent:
self.memory_in = tf.placeholder(

hidden_policy, memory_policy_out = self.create_recurrent_encoder(
hidden_streams[0],
hidden_stream,
hidden_value, memory_value_out = self.create_recurrent_encoder(
hidden_streams[1],
self.memory_in[:, _half_point:],
self.sequence_length,
name="lstm_value",
)
self.memory_out = tf.concat(
[memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
)
self.memory_out = memory_policy_out
hidden_policy = hidden_streams[0]
hidden_value = hidden_streams[1]
hidden_policy = hidden_stream
mu = tf.layers.dense(
hidden_policy,

self.entropy = 0.5 * tf.reduce_mean(
tf.log(2 * np.pi * np.e) + self.log_sigma_sq
)
self.create_value_heads(self.stream_names, hidden_value)
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.log_probs = tf.reduce_sum(
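The entropy line in create_cc_actor above is the closed-form entropy of a diagonal Gaussian, 0.5 * (log(2*pi*e) + log sigma^2), averaged over action dimensions. A small NumPy sketch of the same quantity, as a standalone check rather than code from this commit:

import numpy as np

def gaussian_entropy(log_sigma_sq: np.ndarray) -> float:
    # Mean per-dimension entropy of a diagonal Gaussian policy.
    return float(0.5 * np.mean(np.log(2 * np.pi * np.e) + log_sigma_sq))

print(gaussian_entropy(np.zeros(3)))  # unit variance -> 0.5 * log(2*pi*e) ~= 1.42 nats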

ml-agents/mlagents/trainers/ppo/optimizer.py (110 changes)


import logging
from typing import Optional
from typing import Optional, Dict, List, Any
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.optimizer import TFOptimizer
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,
)
class PPOOptimizer(LearningModel):
class PPOOptimizer(TFOptimizer):
sess,
reward_signal_configs,
lr=1e-4,
lr_schedule=LearningRateSchedule.LINEAR,
h_size=128,

num_layers=2,
m_size=None,
seed=0,
stream_names=None,
vis_encode_type=EncoderType.SIMPLE,
):
"""

:param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
:return: a sub-class of PPOAgent tailored to the environment.
"""
LearningModel.__init__(
self, m_size, normalize, use_recurrent, brain, seed, stream_names
)
self.stream_names = self.reward_signals.keys()
super().__init__(self, sess, self.policy)
self.policy = policy
self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
else:
self.create_dc_actor_critic(h_size, num_layers, vis_encode_type)

self.vector_in = self.policy.vector_in
self.visual_in = self.policy.visual_in
self.entropy,
self.policy.entropy,
beta,
epsilon,
lr,

h_size,
num_layers,
vis_encode_type,
)
stream_scopes=["optimizer"],
)[0]
if self.use_recurrent:
if self.policy.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)

hidden_stream,
self.memory_in[:, _half_point:],
self.sequence_length,
self.policy.sequence_length,
name="lstm_value",
)
self.memory_out = memory_value_out

self.create_value_heads(self.stream_names, hidden_value)
self.all_old_log_probs = tf.placeholder(
shape=[None, self.policy.act_size[0]], dtype=tf.float32, name="old_probabilities"
shape=[None, self.policy.act_size[0]],
dtype=tf.float32,
name="old_probabilities",
)
self.old_log_probs = tf.reduce_sum(

self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
self.grads = self.optimizer.compute_gradients(self.loss)
self.update_batch = self.optimizer.minimize(self.loss)
def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size: batch.num_experiences,
self.policy.sequence_length: 1, # We want to feed data in batch-wise, not time-wise.
}
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = batch["vector_obs"]
if self.policy.vis_obs_size > 0:
for i in range(len(self.policy.visual_in)):
_obs = batch["visual_obs%d" % i]
feed_dict[self.policy.visual_in[i]] = _obs
if self.policy.use_recurrent:
feed_dict[self.policy.memory_in] = batch["memory"]
if self.policy.prev_action is not None:
feed_dict[self.policy.prev_action] = batch["prev_action"]
value_estimates = self.sess.run(self.value_heads, feed_dict)
value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}
return value_estimates
def get_value_estimates(
self, next_obs: List[np.ndarray], agent_id: str, done: bool
) -> Dict[str, float]:
"""
Generates value estimates for bootstrapping.
:param experience: AgentExperience to be used for bootstrapping.
:param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
:return: The value estimate dictionary with key being the name of the reward signal and the value the
corresponding value estimate.
"""
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size: 1,
self.policy.sequence_length: 1,
}
vec_vis_obs = SplitObservations.from_observations(next_obs)
for i in range(len(vec_vis_obs.visual_observations)):
feed_dict[self.policy.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = [vec_vis_obs.vector_observations]
if self.policy.use_recurrent:
feed_dict[self.policy.memory_in] = self.retrieve_memories([agent_id])
if self.policy.prev_action is not None:
feed_dict[self.policy.prev_action] = self.retrieve_previous_action(
[agent_id]
)
value_estimates = self.sess.run(self.value_heads, feed_dict)
value_estimates = {k: float(v) for k, v in value_estimates.items()}
# If we're done, reassign all of the value estimates that need terminal states.
if done:
for k in value_estimates:
if self.reward_signals[k].use_terminal_states:
value_estimates[k] = 0.0
return value_estimates
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.policy, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
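create_reward_signals above builds one reward signal per entry of reward_signal_configs and merges each signal's update ops into update_dict. For orientation, an illustrative config dict in the usual ml-agents shape (the specific keys and values are assumptions, not taken from this commit):

# Illustrative only: typical extrinsic + curiosity settings.
reward_signal_configs = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
    "curiosity": {"strength": 0.02, "gamma": 0.99, "encoding_size": 256},
}
# Each key becomes a named entry in self.reward_signals, and its gamma is later
# read by the trainer when computing returns for that signal's value head.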

ml-agents/mlagents/trainers/ppo/policy.py (58 changes)


from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType, LearningRateSchedule
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.ppo.optimizer import PPOOptimizer
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,
)
from mlagents.trainers.components.bc.module import BCModule
logger = logging.getLogger("mlagents.trainers")

self.create_model(
brain, trainer_params, reward_signal_configs, is_training, load, seed
)
self.create_reward_signals(reward_signal_configs)
with self.graph.as_default():
self.bc_module: Optional[BCModule] = None

trainer_params.get("vis_encode_type", "simple")
),
)
self.model.create_ppo_optimizer()
self.optimizer = PPOOptimizer(
brain=brain,
policy=self.model,
sess=self.sess,
reward_signal_configs=reward_signal_configs,
lr=float(trainer_params["learning_rate"]),
lr_schedule=LearningRateSchedule(
trainer_params.get("learning_rate_schedule", "linear")
),
h_size=int(trainer_params["hidden_units"]),
epsilon=float(trainer_params["epsilon"]),
beta=float(trainer_params["beta"]),
max_step=float(trainer_params["max_steps"]),
normalize=False,
use_recurrent=trainer_params["use_recurrent"],
num_layers=int(trainer_params["num_layers"]),
m_size=self.m_size,
seed=seed,
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
self.optimizer.create_ppo_optimizer()
self.inference_dict.update(
{

"learning_rate": self.model.learning_rate,
"learning_rate": self.optimizer.learning_rate,
self.inference_dict["memory_out"] = self.model.memory_out
self.inference_dict["policy_memory_out"] = self.model.memory_out
self.inference_dict["optimizer_memory_out"] = self.optimizer.memory_out
self.total_policy_loss = self.model.abs_policy_loss
self.total_policy_loss = self.optimizer.abs_policy_loss
"value_loss": self.model.value_loss,
"value_loss": self.optimizer.value_loss,
"update_batch": self.model.update_batch,
"update_batch": self.optimizer.update_batch,
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
with self.graph.as_default():
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.model, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
@timed
def evaluate(
self, batched_step_result: BatchedStepResult, global_agent_ids: List[str]

:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
feed_dict = self.construct_feed_dict(self.model, mini_batch, num_sequences)
feed_dict = self.construct_feed_dict(self.model, self.optimizer, mini_batch, num_sequences)
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.

update_stats[stat_name] = update_vals[update_name]
return update_stats
def construct_feed_dict(self, model, mini_batch, num_sequences):
def construct_feed_dict(self, model, optimizer, mini_batch, num_sequences):
feed_dict = {
model.batch_size: num_sequences,
model.sequence_length: self.sequence_length,
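After the feed dict is built from both the policy model and the optimizer, the update path runs the merged fetches and translates the raw result keys into reported stat names via stats_name_to_update_name. A hedged sketch of that translation step (the concrete stat names below are assumptions for illustration):

# Assumed mapping from reported stat names to keys in the sess.run(update_dict) result.
stats_name_to_update_name = {
    "Losses/Value Loss": "value_loss",
    "Losses/Policy Loss": "policy_loss",
}
update_vals = {"value_loss": 0.12, "policy_loss": 0.03}  # stand-in for sess.run output
update_stats = {
    stat_name: update_vals[update_name]
    for stat_name, update_name in stats_name_to_update_name.items()
}
print(update_stats)  # {'Losses/Value Loss': 0.12, 'Losses/Policy Loss': 0.03}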

ml-agents/mlagents/trainers/ppo/trainer.py (14 changes)


self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
# Get all value estimates
value_estimates = self.policy.get_batched_value_estimates(
value_estimates = self.policy.optimizer.get_batched_value_estimates(
self.policy.reward_signals[name].value_name, np.mean(v)
self.policy.optimizer.reward_signals[name].value_name, np.mean(v)
value_next = self.policy.get_value_estimates(
value_next = self.policy.optimizer.get_value_estimates(
trajectory.next_obs,
agent_id,
trajectory.done_reached and not trajectory.max_step_reached,

self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.policy.reward_signals.items():
for name, reward_signal in self.policy.optimizer.reward_signals.items():
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward

# Compute GAE and returns
tmp_advantages = []
tmp_returns = []
for name in self.policy.reward_signals:
for name in self.policy.optimizer.reward_signals:
bootstrap_value = value_next[name]
local_rewards = agent_buffer_trajectory[

rewards=local_rewards,
value_estimates=local_value_estimates,
value_next=bootstrap_value,
gamma=self.policy.reward_signals[name].gamma,
gamma=self.policy.optimizer.reward_signals[name].gamma,
lambd=self.trainer_parameters["lambd"],
)
local_return = local_advantage + local_value_estimates

self.load,
)
for _reward_signal in policy.reward_signals.keys():
for _reward_signal in policy.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
return policy
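The advantage computation above passes per-signal rewards, value estimates, a bootstrap value, gamma, and lambd, and then forms returns as local_advantage + local_value_estimates. A self-contained sketch of GAE matching that call shape (the helper name and default parameters are assumed, not confirmed by this hunk):

import numpy as np

def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    # Generalized Advantage Estimation: discount the TD residuals
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) by (gamma * lambd).
    values = np.append(np.asarray(value_estimates, dtype=np.float64), value_next)
    deltas = np.asarray(rewards, dtype=np.float64) + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lambd * running
        advantages[t] = running
    return advantages

# local_return would then be get_gae(...) + local_value_estimates, as in the hunk above.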

ml-agents/mlagents/trainers/rl_trainer.py (2 changes)


rewards[agent_id] = 0
else:
self.stats_reporter.add_stat(
policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
policy.optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0)
)
rewards[agent_id] = 0
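The change here only redirects the stat lookup to policy.optimizer.reward_signals; the surrounding logic keeps a running total per agent, reports it when that agent's episode ends, and resets it. A small sketch of that bookkeeping pattern with assumed names:

from collections import defaultdict

collected_rewards = {"extrinsic": defaultdict(float)}  # per-signal, per-agent totals (assumed shape)

def end_episode_report(stats_reporter, policy, name, agent_id):
    # Report the accumulated reward for this signal and agent, then reset it.
    rewards = collected_rewards[name]
    stats_reporter.add_stat(
        policy.optimizer.reward_signals[name].stat_name, rewards.get(agent_id, 0)
    )
    rewards[agent_id] = 0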

ml-agents/mlagents/trainers/optimizer.py (34 changes)


import abc
from typing import Dict, Any
from mlagents.tf_utils import tf
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.policy import Policy
from mlagents.trainers.tf_policy import TFPolicy
class Optimizer(abc.ABC):
"""
Creates loss functions and auxiliary networks (e.g. Q or Value) needed for training.
Provides methods to update the Policy.
"""
def __init__(self, policy: Policy, optimizer_parameters: Dict[str, Any]):
"""
Create loss functions and auxiliary networks.
"""
@abc.abstractmethod
def update_batch(self, batch: AgentBuffer):
"""
Update the Policy based on the batch that was passed in.
"""
class TFOptimizer(Optimizer):
def __init__(self, sess: tf.Session, policy: TFPolicy, reward_signal_configs):
self.sess = sess
self.policy = policy
self.update_dict: Dict[str, tf.Tensor] = {}
self.value_heads: Dict[str, tf.Tensor] = {}
self.create_reward_signals(reward_signal_configs)
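With the new base classes above, a concrete optimizer receives the session and the policy, registers its fetches in update_dict, and implements update_batch. A minimal sketch of such a subclass (the class and its trivial loss are illustrative only, reusing the names imported in this file):

class ConstantLossOptimizer(TFOptimizer):
    """Illustrative subclass: wires a trivial loss into the TFOptimizer plumbing."""

    def __init__(self, sess: tf.Session, policy: TFPolicy, reward_signal_configs):
        # Assumes the base class provides create_reward_signals, as called in its __init__.
        super().__init__(sess, policy, reward_signal_configs)
        self.loss = tf.constant(0.0)  # placeholder loss for the sketch
        self.update_dict["loss"] = self.loss

    def update_batch(self, batch: AgentBuffer):
        # Feed the batch through the policy's placeholders and run the registered fetches.
        feed_dict = {
            self.policy.batch_size: batch.num_experiences,
            self.policy.sequence_length: 1,
        }
        if self.policy.vec_obs_size > 0:
            feed_dict[self.policy.vector_in] = batch["vector_obs"]
        return self.sess.run(self.update_dict, feed_dict)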