
Zeroed version of LSTM working for PPO

/develop/nopreviousactions
Ervin Teng, 5 years ago
Current commit
edeceefd
6 changed files with 61 additions and 45 deletions
  1. ml-agents/mlagents/trainers/common/nn_policy.py (6 changed lines)
  2. ml-agents/mlagents/trainers/optimizer.py (53 changed lines)
  3. ml-agents/mlagents/trainers/ppo/optimizer.py (30 changed lines)
  4. ml-agents/mlagents/trainers/ppo/trainer.py (12 changed lines)
  5. ml-agents/mlagents/trainers/sac/trainer.py (4 changed lines)
  6. ml-agents/mlagents/trainers/tf_policy.py (1 changed line)

ml-agents/mlagents/trainers/common/nn_policy.py (6 changed lines)


  self.memory_in = tf.placeholder(
      shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
  )
- _half_point = int(self.m_size / 2)
-     self.memory_in[:, :_half_point],
+     self.memory_in,
      self.sequence_length_ph,
      name="lstm_policy",
  )

  self.memory_in = tf.placeholder(
      shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
  )
- _half_point = int(self.m_size / 2)
-     self.memory_in[:, :_half_point],
+     self.memory_in,
      self.sequence_length_ph,
      name="lstm_policy",
  )
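The `_half_point` split disappears in both actor builders: the policy LSTM now consumes the whole `recurrent_in` placeholder, because the value network keeps its own memory inside the optimizer (see ppo/optimizer.py below). For orientation, a minimal sketch of such a recurrent encoder, assuming TF 1.x and treating sequence_length as a plain int; `recurrent_encoder` and its arguments are illustrative stand-ins, not the ml-agents helper:

import tensorflow as tf


def recurrent_encoder(input_state, memory_in, sequence_length, name="lstm_policy"):
    # Encode a [batch * time, features] tensor with an LSTM whose initial state is the
    # full, flat [num_sequences, m_size] memory placeholder (no half-split any more).
    s_size = input_state.get_shape().as_list()[1]
    m_size = memory_in.get_shape().as_list()[1]
    half = m_size // 2
    lstm_input = tf.reshape(input_state, shape=[-1, sequence_length, s_size])
    with tf.variable_scope(name):
        cell = tf.nn.rnn_cell.BasicLSTMCell(half)
        # First half of the flat memory is the cell state, second half the hidden state.
        state = tf.nn.rnn_cell.LSTMStateTuple(memory_in[:, :half], memory_in[:, half:])
        output, state_out = tf.nn.dynamic_rnn(cell, lstm_input, initial_state=state)
    # Return per-step features plus the flattened final state to feed back next time.
    return (
        tf.reshape(output, shape=[-1, half]),
        tf.concat([state_out.c, state_out.h], axis=1),
    )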

ml-agents/mlagents/trainers/optimizer.py (53 changed lines)


  import abc
- from typing import Dict, Any, List
+ from typing import Dict, Any, List, Tuple
  import numpy as np
  from mlagents.tf_utils.tf import tf

  self.update_dict: Dict[str, tf.Tensor] = {}
  self.value_heads: Dict[str, tf.Tensor] = {}
  self.create_reward_signals(trainer_params["reward_signals"])
+ self.memory_in: tf.Tensor = None
+ self.memory_out: tf.Tensor = None
+ self.m_size: int = 0
- def get_batched_value_estimates(self, batch: AgentBuffer) -> Dict[str, np.ndarray]:
+ def get_trajectory_value_estimates(
+     self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
+ ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
-     self.policy.sequence_length_ph: 1,  # We want to feed data in batch-wise, not time-wise.
+     self.policy.sequence_length_ph: batch.num_experiences,  # We want to feed data in batch-wise, not time-wise.
  }
  if self.policy.vec_obs_size > 0:

      _obs = batch["visual_obs%d" % i]
      feed_dict[self.policy.visual_in[i]] = _obs
  if self.policy.use_recurrent:
-     feed_dict[self.policy.memory_in] = batch["memory"]
+     feed_dict[self.policy.memory_in] = [np.zeros((self.policy.m_size))]
+     feed_dict[self.memory_in] = [np.zeros((self.m_size))]
- value_estimates = self.sess.run(self.value_heads, feed_dict)
+ if self.policy.use_recurrent:
+     value_estimates, policy_mem, value_mem = self.sess.run(
+         [self.value_heads, self.policy.memory_out, self.memory_out], feed_dict
+     )
+     prev_action = batch["actions"][-1]
+ else:
+     value_estimates = self.sess.run(self.value_heads, feed_dict)
+     prev_action = None
+     policy_mem = None
+     value_mem = None
- return value_estimates
+ # We do this in a separate step to feed the memory outs - a further optimization would
+ # be to append to the obs before running sess.run.
+ final_value_estimates = self.get_value_estimates(
+     next_obs, done, policy_mem, value_mem, prev_action
+ )
+ return value_estimates, final_value_estimates
- self, next_obs: List[np.ndarray], agent_id: str, done: bool
+ self,
+ next_obs: List[np.ndarray],
+ done: bool,
+ policy_memory: np.ndarray = None,
+ value_memory: np.ndarray = None,
+ prev_action: np.ndarray = None,
  ) -> Dict[str, float]:
"""
Generates value estimates for bootstrapping.

  if self.policy.vec_obs_size > 0:
      feed_dict[self.policy.vector_in] = [vec_vis_obs.vector_observations]
+ # if self.policy.use_recurrent:
+ #     feed_dict[self.policy.memory_in] = self.policy.retrieve_memories([agent_id])
+ #     if self.policy.prev_action is not None:
+ #         feed_dict[self.policy.prev_action] = self.policy.retrieve_previous_action(
+ #             [agent_id]
+ #         )
+ if policy_memory is not None:
+     feed_dict[self.policy.memory_in] = policy_memory
+ if value_memory is not None:
+     feed_dict[self.memory_in] = value_memory
+ if prev_action is not None:
+     feed_dict[self.policy.prev_action] = [prev_action]
  value_estimates = self.sess.run(self.value_heads, feed_dict)
  value_estimates = {k: float(v) for k, v in value_estimates.items()}
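This hunk is the core of the commit: value estimates for a whole trajectory are computed in one pass that starts from a zeroed memory, and the resulting memory_out tensors are fed into a second, single-step pass that bootstraps the value of next_obs. A small sketch of that two-pass flow, with a hypothetical `critic` callable standing in for the TF value heads (names and signatures here are illustrative, not the ml-agents API):

import numpy as np
from typing import Callable, Tuple

# Hypothetical critic: (observations, initial memory) -> (per-step values, final memory).
Critic = Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]


def trajectory_value_estimates(
    critic: Critic, obs: np.ndarray, next_obs: np.ndarray, done: bool, m_size: int
) -> Tuple[np.ndarray, float]:
    zero_mem = np.zeros((1, m_size), dtype=np.float32)   # the "zeroed" LSTM state
    values, mem_out = critic(obs, zero_mem)               # whole trajectory as one sequence
    next_value, _ = critic(next_obs[None, :], mem_out)    # separate step, fed the memory outs
    return values, 0.0 if done else float(next_value[-1])  # no bootstrap past a true terminal


# Toy usage with a stateless dummy critic:
dummy: Critic = lambda o, m: (o.sum(axis=1), m)
per_step, value_next = trajectory_value_estimates(
    dummy, np.ones((5, 3), np.float32), np.ones(3, np.float32), done=False, m_size=8
)

Zeroing the bootstrap when `done` is true is an assumption of this sketch; it matches why the trainer below passes `done_reached and not max_step_reached` rather than a raw done flag.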

ml-agents/mlagents/trainers/ppo/optimizer.py (30 changed lines)


"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
if self.policy.use_recurrent:
self.m_size = self.policy.m_size
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
if num_layers < 1:
num_layers = 1

# Add some stuff to inference dict from optimizer
self.policy.inference_dict["learning_rate"] = self.learning_rate
if self.policy.use_recurrent:
self.policy.inference_dict["optimizer_memory_out"]: self.memory_out
  def create_cc_critic(
      self, h_size: int, num_layers: int, vis_encode_type: EncoderType

  )[0]
  if self.policy.use_recurrent:
-     self.memory_in = tf.placeholder(
-         shape=[None, self.policy.m_size], dtype=tf.float32, name="recurrent_in"
-     )
-     _half_point = int(self.policy.m_size / 2)
-         self.memory_in[:, _half_point:],
-         self.policy.sequence_length,
+         self.memory_in,
+         self.policy.sequence_length_ph,
          name="lstm_value",
      )
      self.memory_out = memory_value_out

  )[0]
  if self.policy.use_recurrent:
-     self.memory_in = tf.placeholder(
-         shape=[None, self.policy.m_size], dtype=tf.float32, name="recurrent_in"
-     )
-     _half_point = int(self.policy.m_size / 2)
-         self.memory_in[:, _half_point:],
-         self.policy.sequence_length,
+         self.memory_in,
+         self.policy.sequence_length_ph,
          name="lstm_value",
      )
      self.memory_out = memory_value_out

  feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
  if self.policy.use_recurrent:
      mem_in = [
-         mini_batch["memory"][i]
+         np.zeros((self.policy.m_size))
          for i in range(
-             0, len(mini_batch["memory"]), self.policy.sequence_length
+             0, mini_batch.num_experiences, self.policy.sequence_length
          )
      ]
      feed_dict[self.memory_in] = mem_in
  return feed_dict
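Two things change in this file: the value network gets its own full-size `recurrent_in` placeholder, created once in `__init__` instead of inside the critic builders, and `construct_feed_dict` now feeds it one zeroed row per training sequence. A tiny numpy sketch of that memory batching, with an illustrative function name:

import numpy as np


def initial_value_memories(num_experiences: int, sequence_length: int, m_size: int) -> np.ndarray:
    # One zeroed initial memory for every sequence_length-sized slice of the mini-batch,
    # i.e. the zeroed counterpart of gathering mini_batch["memory"] at sequence boundaries.
    return np.stack(
        [np.zeros(m_size, dtype=np.float32) for _ in range(0, num_experiences, sequence_length)]
    )


# A 64-step mini-batch trained with sequence_length 16 needs 4 initial memory rows:
assert initial_value_memories(64, 16, m_size=8).shape == (4, 8)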

ml-agents/mlagents/trainers/ppo/trainer.py (12 changed lines)


  self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
  # Get all value estimates
- value_estimates = self.optimizer.get_batched_value_estimates(
-     agent_buffer_trajectory
+ value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
+     agent_buffer_trajectory,
+     trajectory.next_obs,
+     trajectory.done_reached and not trajectory.max_step_reached,
  )
  for name, v in value_estimates.items():
      agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)

- value_next = self.optimizer.get_value_estimates(
-     trajectory.next_obs,
-     agent_id,
-     trajectory.done_reached and not trajectory.max_step_reached,
- )
  # Evaluate all reward functions
  self.collected_rewards["environment"][agent_id] += np.sum(

ml-agents/mlagents/trainers/sac/trainer.py (4 changed lines)


  self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
  # Get all value estimates for reporting purposes
- value_estimates = self.optimizer.get_batched_value_estimates(
-     agent_buffer_trajectory
+ value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
+     agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
  )
  for name, v in value_estimates.items():
      self.stats_reporter.add_stat(

ml-agents/mlagents/trainers/tf_policy.py (1 changed line)


self.action_masks: Optional[tf.Tensor] = None
self.prev_action: Optional[tf.Tensor] = None
self.memory_in: Optional[tf.Tensor] = None
self.memory_out: Optional[tf.Tensor] = None
self.global_step, self.increment_step_op, self.steps_to_increment = (
LearningModel.create_global_steps()
