
Move add_experiences out of trainer, add Trajectories (#3067)

/asymm-envs
GitHub · 5 years ago
Current commit
2fd305e7
32 files changed: 1,261 insertions and 837 deletions
  1. ml-agents/mlagents/trainers/action_info.py (3 changes)
  2. ml-agents/mlagents/trainers/agent_processor.py (198 changes)
  3. ml-agents/mlagents/trainers/buffer.py (29 changes)
  4. ml-agents/mlagents/trainers/components/bc/module.py (1 change)
  5. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (2 changes)
  6. ml-agents/mlagents/trainers/curriculum.py (13 changes)
  7. ml-agents/mlagents/trainers/demo_loader.py (32 changes)
  8. ml-agents/mlagents/trainers/learn.py (6 changes)
  9. ml-agents/mlagents/trainers/models.py (26 changes)
  10. ml-agents/mlagents/trainers/ppo/policy.py (45 changes)
  11. ml-agents/mlagents/trainers/ppo/trainer.py (212 changes)
  12. ml-agents/mlagents/trainers/rl_trainer.py (246 changes)
  13. ml-agents/mlagents/trainers/sac/policy.py (2 changes)
  14. ml-agents/mlagents/trainers/sac/trainer.py (140 changes)
  15. ml-agents/mlagents/trainers/tests/mock_brain.py (45 changes)
  16. ml-agents/mlagents/trainers/tests/test_buffer.py (95 changes)
  17. ml-agents/mlagents/trainers/tests/test_meta_curriculum.py (2 changes)
  18. ml-agents/mlagents/trainers/tests/test_ppo.py (174 changes)
  19. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (48 changes)
  20. ml-agents/mlagents/trainers/tests/test_sac.py (38 changes)
  21. ml-agents/mlagents/trainers/tests/test_simple_rl.py (3 changes)
  22. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (22 changes)
  23. ml-agents/mlagents/trainers/tests/test_trainer_util.py (4 changes)
  24. ml-agents/mlagents/trainers/tf_policy.py (62 changes)
  25. ml-agents/mlagents/trainers/trainer.py (108 changes)
  26. ml-agents/mlagents/trainers/trainer_controller.py (39 changes)
  27. ml-agents/mlagents/trainers/trainer_util.py (4 changes)
  28. ml-agents/mlagents/trainers/stats.py (118 changes)
  29. ml-agents/mlagents/trainers/tests/test_agent_processor.py (63 changes)
  30. ml-agents/mlagents/trainers/tests/test_stats.py (80 changes)
  31. ml-agents/mlagents/trainers/tests/test_trajectory.py (110 changes)
  32. ml-agents/mlagents/trainers/trajectory.py (128 changes)

ml-agents/mlagents/trainers/action_info.py (3 changes)


from typing import NamedTuple, Any, Dict
import numpy as np
ActionInfoOutputs = Dict[str, Any]
ActionInfoOutputs = Dict[str, np.ndarray]
class ActionInfo(NamedTuple):
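
The tightened ActionInfoOutputs alias means every entry coming out of the policy is a NumPy array rather than an arbitrary object. For illustration only, a fake outputs dictionary whose keys mirror the PPO policy's inference dict shown further down in this diff; the shapes are assumptions for a batch of two agents with a 3-dimensional continuous action:

import numpy as np
from mlagents.trainers.action_info import ActionInfoOutputs

# Illustrative values only; keys follow the PPO policy's inference dict below.
fake_outputs: ActionInfoOutputs = {
    "action": np.zeros((2, 3), dtype=np.float32),
    "pre_action": np.zeros((2, 3), dtype=np.float32),
    "log_probs": np.zeros((2, 3), dtype=np.float32),
    "entropy": np.full((2,), 0.5, dtype=np.float32),
    "learning_rate": np.array(3e-4, dtype=np.float32),
}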

ml-agents/mlagents/trainers/agent_processor.py (198 changes)


from typing import List, Union
import sys
from typing import List, Dict
from collections import defaultdict, Counter
from mlagents.trainers.buffer import AgentBuffer, BufferException
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.action_info import ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter
class ProcessingBuffer(dict):
class AgentProcessor:
ProcessingBuffer contains a dictionary of AgentBuffer. The AgentBuffers are indexed by agent_id.
AgentProcessor contains a dictionary of per-agent trajectory buffers. The buffers are indexed by agent_id.
Buffer also contains an update_buffer that corresponds to the buffer used when updating the model.
One AgentProcessor should be created per agent group.
def __str__(self):
return "local_buffers :\n{0}".format(
"\n".join(["\tagent {0} :{1}".format(k, str(self[k])) for k in self.keys()])
)
def __getitem__(self, key):
if key not in self.keys():
self[key] = AgentBuffer()
return super().__getitem__(key)
def reset_local_buffers(self) -> None:
def __init__(
self,
trainer: Trainer,
policy: TFPolicy,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,
):
Resets all the local AgentBuffers.
Create an AgentProcessor.
:param trainer: Trainer instance connected to this AgentProcessor. Trainer is given trajectory
when it is finished.
:param policy: Policy instance associated with this AgentProcessor.
:param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
:param stats_reporter: The StatsReporter used to write stats. Usually, this comes from the Trainer.
for buf in self.values():
buf.reset_agent()
self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
self.last_brain_info: Dict[str, BrainInfo] = {}
self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}
# Note: this is needed until we switch to AgentExperiences as the data input type.
# We still need some info from the policy (memories, previous actions)
# that really should be gathered by the env-manager.
self.policy = policy
self.episode_steps: Counter = Counter()
self.episode_rewards: Dict[str, float] = defaultdict(float)
self.stats_reporter = stats_reporter
self.trainer = trainer
self.max_trajectory_length = max_trajectory_length
def append_to_update_buffer(
def add_experiences(
update_buffer: AgentBuffer,
agent_id: Union[int, str],
key_list: List[str] = None,
batch_size: int = None,
training_length: int = None,
curr_info: BrainInfo,
next_info: BrainInfo,
take_action_outputs: ActionInfoOutputs,
Appends the buffer of an agent to the update buffer.
:param update_buffer: A reference to an AgentBuffer to append the agent's buffer to
:param agent_id: The id of the agent which data will be appended
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
Adds experiences to each agent's experience history.
:param curr_info: current BrainInfo.
:param next_info: next BrainInfo.
:param take_action_outputs: The outputs of the Policy's get_action method.
if key_list is None:
key_list = self[agent_id].keys()
if not self[agent_id].check_length(key_list):
raise BufferException(
"The length of the fields {0} for agent {1} were not of same length".format(
key_list, agent_id
)
if take_action_outputs:
self.stats_reporter.add_stat(
"Policy/Entropy", take_action_outputs["entropy"].mean()
for field_key in key_list:
update_buffer[field_key].extend(
self[agent_id][field_key].get_batch(
batch_size=batch_size, training_length=training_length
)
self.stats_reporter.add_stat(
"Policy/Learning Rate", take_action_outputs["learning_rate"]
def append_all_agent_batch_to_update_buffer(
self,
update_buffer: AgentBuffer,
key_list: List[str] = None,
batch_size: int = None,
training_length: int = None,
) -> None:
"""
Appends the buffer of all agents to the update buffer.
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
"""
for agent_id in self.keys():
self.append_to_update_buffer(
update_buffer, agent_id, key_list, batch_size, training_length
)
for agent_id in curr_info.agents:
self.last_brain_info[agent_id] = curr_info
self.last_take_action_outputs[agent_id] = take_action_outputs
# Store the environment reward
tmp_environment_reward = next_info.rewards
for next_idx, agent_id in enumerate(next_info.agents):
stored_info = self.last_brain_info.get(agent_id, None)
if stored_info is not None:
stored_take_action_outputs = self.last_take_action_outputs[agent_id]
idx = stored_info.agents.index(agent_id)
obs = []
if not stored_info.local_done[idx]:
for i, _ in enumerate(stored_info.visual_observations):
obs.append(stored_info.visual_observations[i][idx])
if self.policy.use_vec_obs:
obs.append(stored_info.vector_observations[idx])
if self.policy.use_recurrent:
memory = self.policy.retrieve_memories([agent_id])[0, :]
else:
memory = None
done = next_info.local_done[next_idx]
max_step = next_info.max_reached[next_idx]
# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
if self.policy.use_continuous_act:
action_pre = stored_take_action_outputs["pre_action"][idx]
else:
action_pre = None
action_probs = stored_take_action_outputs["log_probs"][idx]
action_masks = stored_info.action_masks[idx]
prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]
experience = AgentExperience(
obs=obs,
reward=tmp_environment_reward[next_idx],
done=done,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_masks,
prev_action=prev_action,
max_step=max_step,
memory=memory,
)
# Add the value outputs if needed
self.experience_buffers[agent_id].append(experience)
self.episode_rewards[agent_id] += tmp_environment_reward[next_idx]
if (
next_info.local_done[next_idx]
or (
len(self.experience_buffers[agent_id])
>= self.max_trajectory_length
)
) and len(self.experience_buffers[agent_id]) > 0:
# Make next AgentExperience
next_obs = []
for i, _ in enumerate(next_info.visual_observations):
next_obs.append(next_info.visual_observations[i][next_idx])
if self.policy.use_vec_obs:
next_obs.append(next_info.vector_observations[next_idx])
trajectory = Trajectory(
steps=self.experience_buffers[agent_id],
agent_id=agent_id,
next_obs=next_obs,
)
# This will eventually be replaced with a queue
self.trainer.process_trajectory(trajectory)
self.experience_buffers[agent_id] = []
if next_info.local_done[next_idx]:
self.stats_reporter.add_stat(
"Environment/Cumulative Reward",
self.episode_rewards.get(agent_id, 0),
)
self.stats_reporter.add_stat(
"Environment/Episode Length",
self.episode_steps.get(agent_id, 0),
)
del self.episode_steps[agent_id]
del self.episode_rewards[agent_id]
elif not next_info.local_done[next_idx]:
self.episode_steps[agent_id] += 1
self.policy.save_previous_action(
curr_info.agents, take_action_outputs["action"]
)
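
Constructing one of these processors is straightforward; below is a hedged sketch of the wiring implied by the constructor above. The StatsReporter constructor taking a single category string is an assumption (writers such as TensorboardWriter are registered separately, as in the learn.py change below), and stats_category/time_horizon are whatever the caller already has:

from mlagents.trainers.agent_processor import AgentProcessor
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.trainer import Trainer

def make_processor(
    trainer: Trainer, policy: TFPolicy, stats_category: str, time_horizon: int
) -> AgentProcessor:
    # One AgentProcessor per agent group. It buffers AgentExperiences per agent
    # and hands finished Trajectory objects to trainer.process_trajectory().
    reporter = StatsReporter(stats_category)  # category-string constructor is an assumption
    return AgentProcessor(
        trainer=trainer,
        policy=policy,
        stats_reporter=reporter,
        max_trajectory_length=time_horizon,
    )

Each environment step, the caller invokes add_experiences(curr_info, next_info, take_action_outputs); once an agent is done or its buffered steps reach max_trajectory_length, the processor assembles a Trajectory and calls trainer.process_trajectory() on it (a call the comment above notes will eventually be replaced with a queue).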

ml-agents/mlagents/trainers/buffer.py (29 changes)


for _key in self.keys():
self[_key] = self[_key][current_length - max_length :]
def resequence_and_append(
self,
target_buffer: "AgentBuffer",
key_list: List[str] = None,
batch_size: int = None,
training_length: int = None,
) -> None:
"""
Takes in a batch size and training length (sequence length), and appends this AgentBuffer to target_buffer
properly padded for LSTM use. Optionally, use key_list to restrict which fields are inserted into the new
buffer.
:param target_buffer: The buffer which to append the samples to.
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
"""
if key_list is None:
key_list = list(self.keys())
if not self.check_length(key_list):
raise BufferException(
"The length of the fields {0} were not of same length".format(key_list)
)
for field_key in key_list:
target_buffer[field_key].extend(
self[field_key].get_batch(
batch_size=batch_size, training_length=training_length
)
)
@property
def num_experiences(self) -> int:
"""

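resequence_and_append replaces the old per-agent append_to_update_buffer path. A minimal standalone usage, mirroring the rewritten test_buffer.py further down (assumes the ml-agents package from this commit is importable):

from mlagents.trainers.buffer import AgentBuffer

# Nine steps for one agent, resequenced into an update buffer with an LSTM
# sequence length of 2; 9 steps pad out to 10 entries (5 sequences of 2).
local_buffer = AgentBuffer()
for step in range(9):
    local_buffer["vector_observation"].append([float(step)] * 3)
    local_buffer["action"].append([float(step), float(step) + 1.0])

update_buffer = AgentBuffer()
local_buffer.resequence_and_append(update_buffer, batch_size=None, training_length=2)
assert update_buffer.num_experiences == 10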
ml-agents/mlagents/trainers/components/bc/module.py (1 change)


feed_dict[self.policy.model.prev_action] = mini_batch_demo[
"prev_action"
]
network_out = self.policy.sess.run(
list(self.out_dict.values()), feed_dict=feed_dict
)

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (2 changes)


return RewardSignalResult(scaled_reward, unscaled_reward)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
env_rews = np.array(mini_batch["environment_rewards"])
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
return RewardSignalResult(self.strength * env_rews, env_rews)
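
The new evaluate_batch path reads the raw environment rewards straight out of the mini batch and casts them to float32. In plain NumPy terms (strength is the signal's configured multiplier; the reward values here are made up):

import numpy as np

strength = 1.0
mini_batch = {"environment_rewards": [0.0, 0.1, -0.05]}
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
scaled_reward, unscaled_reward = strength * env_rews, env_rews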

ml-agents/mlagents/trainers/curriculum.py (13 changes)


import os
import json
import math
from typing import Dict, Any, TextIO
from .exception import CurriculumConfigError, CurriculumLoadingError

)
@property
def lesson_num(self):
def lesson_num(self) -> int:
def lesson_num(self, lesson_num):
def lesson_num(self, lesson_num: int) -> None:
def increment_lesson(self, measure_val):
def increment_lesson(self, measure_val: float) -> bool:
"""
Increments the lesson number depending on the progress given.
:param measure_val: Measure of progress (either reward or percentage

return True
return False
def get_config(self, lesson=None):
def get_config(self, lesson: int = None) -> Dict[str, Any]:
"""
Returns reset parameters which correspond to the lesson.
:param lesson: The lesson you want to get the config of. If None, the

return config
@staticmethod
def load_curriculum_file(location):
def load_curriculum_file(location: str) -> None:
try:
with open(location) as data_file:
return Curriculum._load_curriculum(data_file)

)
@staticmethod
def _load_curriculum(fp):
def _load_curriculum(fp: TextIO) -> None:
try:
return json.load(fp)
except json.decoder.JSONDecodeError as e:

ml-agents/mlagents/trainers/demo_loader.py (32 changes)


from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
from mlagents.trainers.brain import BrainParameters, BrainInfo
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,

sequence_length: int,
) -> AgentBuffer:
# Create and populate buffer using experiences
demo_process_buffer = ProcessingBuffer()
demo_buffer = AgentBuffer()
demo_raw_buffer = AgentBuffer()
demo_processed_buffer = AgentBuffer()
for idx, experience in enumerate(pair_infos):
if idx > len(pair_infos) - 2:
break

previous_action = np.array(
pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
)
demo_process_buffer[0].last_brain_info = current_brain_info
demo_process_buffer[0]["done"].append(next_brain_info.local_done[0])
demo_process_buffer[0]["rewards"].append(next_brain_info.rewards[0])
demo_raw_buffer["done"].append(next_brain_info.local_done[0])
demo_raw_buffer["rewards"].append(next_brain_info.rewards[0])
demo_process_buffer[0]["visual_obs%d" % i].append(
demo_raw_buffer["visual_obs%d" % i].append(
demo_process_buffer[0]["vector_obs"].append(
demo_raw_buffer["vector_obs"].append(
demo_process_buffer[0]["actions"].append(
current_pair_info.action_info.vector_actions
)
demo_process_buffer[0]["prev_action"].append(previous_action)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
demo_raw_buffer["prev_action"].append(previous_action)
demo_process_buffer.append_to_update_buffer(
demo_buffer, 0, batch_size=None, training_length=sequence_length
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length
demo_process_buffer.reset_local_buffers()
demo_process_buffer.append_to_update_buffer(
demo_buffer, 0, batch_size=None, training_length=sequence_length
demo_raw_buffer.reset_agent()
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length
return demo_buffer
return demo_processed_buffer
@timed

ml-agents/mlagents/trainers/learn.py (6 changes)


from mlagents.trainers.exception import TrainerError
from mlagents.trainers.meta_curriculum import MetaCurriculum
from mlagents.trainers.trainer_util import load_config, TrainerFactory
from mlagents.trainers.stats import TensorboardWriter, StatsReporter
from mlagents_envs.environment import UnityEnvironment
from mlagents.trainers.sampler_class import SamplerManager
from mlagents.trainers.exception import SamplerException

)
trainer_config = load_config(trainer_config_path)
port = options.base_port + (sub_id * options.num_envs)
# Configure Tensorboard Writers and StatsReporter
tb_writer = TensorboardWriter(summaries_dir)
StatsReporter.add_writer(tb_writer)
if options.env_path is None:
port = 5004 # This is the in Editor Training Port
env_factory = create_environment_factory(

ml-agents/mlagents/trainers/models.py (26 changes)


[],
trainable=False,
dtype=tf.int32,
initializer=tf.ones_initializer(),
initializer=tf.zeros_initializer(),
)
self.running_mean = tf.get_variable(
"running_mean",

self.update_normalization = self.create_normalizer_update(vector_obs)
def create_normalizer_update(self, vector_input):
mean_current_observation = tf.reduce_mean(vector_input, axis=0)
new_mean = self.running_mean + (
mean_current_observation - self.running_mean
) / tf.cast(tf.add(self.normalization_steps, 1), tf.float32)
new_variance = self.running_variance + (mean_current_observation - new_mean) * (
mean_current_observation - self.running_mean
# Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
# https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
steps_increment = tf.shape(vector_input)[0]
total_new_steps = tf.add(self.normalization_steps, steps_increment)
# Compute the incremental update and divide by the number of new steps.
input_to_old_mean = tf.subtract(vector_input, self.running_mean)
new_mean = self.running_mean + tf.reduce_sum(
input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0
)
# Compute difference of input to the new mean for Welford update
input_to_new_mean = tf.subtract(vector_input, new_mean)
new_variance = self.running_variance + tf.reduce_sum(
input_to_new_mean * input_to_old_mean, axis=0
update_norm_step = tf.assign(
self.normalization_steps, self.normalization_steps + 1
)
update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
return tf.group([update_mean, update_variance, update_norm_step])
@staticmethod
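
The new normalizer update is the batch form of Welford's algorithm referenced in the comment above; note that running_variance accumulates the sum of squared deviations (M2), not the variance itself, so dividing by the step count recovers the variance. A standalone NumPy sketch of the same update, with a quick numerical check on illustrative data (variable names are my own):

import numpy as np

def welford_batch_update(batch, mean, m2, steps):
    # Mirrors create_normalizer_update(): m2 is the running sum of squared
    # deviations (the model's running_variance); m2 / steps is the variance.
    batch = np.asarray(batch, dtype=np.float64)
    new_steps = steps + batch.shape[0]
    delta_old = batch - mean                      # input_to_old_mean
    new_mean = mean + delta_old.sum(axis=0) / new_steps
    delta_new = batch - new_mean                  # input_to_new_mean
    new_m2 = m2 + (delta_new * delta_old).sum(axis=0)
    return new_mean, new_m2, new_steps

rng = np.random.default_rng(0)
data = rng.normal(size=(1000, 3))
mean, m2, steps = np.zeros(3), np.zeros(3), 0
for chunk in np.array_split(data, 10):
    mean, m2, steps = welford_batch_update(chunk, mean, m2, steps)
assert np.allclose(mean, data.mean(axis=0))
assert np.allclose(m2 / steps, data.var(axis=0))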

ml-agents/mlagents/trainers/ppo/policy.py (45 changes)


from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.brain import BrainInfo, BrainParameters
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType, LearningRateSchedule
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.tf_policy import TFPolicy

{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value_heads": self.model.value_heads,
"value": self.model.value,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}

]
feed_dict[model.memory_in] = mem_in
return feed_dict
def get_value_estimates(
self, brain_info: BrainInfo, idx: int, done: bool
) -> Dict[str, float]:
"""
Generates value estimates for bootstrapping.
:param brain_info: BrainInfo to be used for bootstrapping.
:param idx: Index in BrainInfo of agent.
:param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
:return: The value estimate dictionary with key being the name of the reward signal and the value the
corresponding value estimate.
"""
feed_dict: Dict[tf.Tensor, Any] = {
self.model.batch_size: 1,
self.model.sequence_length: 1,
}
for i in range(len(brain_info.visual_observations)):
feed_dict[self.model.visual_in[i]] = [
brain_info.visual_observations[i][idx]
]
if self.use_vec_obs:
feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
agent_id = brain_info.agents[idx]
if self.use_recurrent:
feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
if not self.use_continuous_act and self.use_recurrent:
feed_dict[self.model.prev_action] = self.retrieve_previous_action(
[agent_id]
)
value_estimates = self.sess.run(self.model.value_heads, feed_dict)
value_estimates = {k: float(v) for k, v in value_estimates.items()}
# If we're done, reassign all of the value estimates that need terminal states.
if done:
for k in value_estimates:
if self.reward_signals[k].use_terminal_states:
value_estimates[k] = 0.0
return value_estimates
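
The terminal-state handling at the end of get_value_estimates is worth calling out: reward signals that treat episode ends as terminal bootstrap from a value of 0. A standalone restatement of just that last step (the dictionary values are made up; uses_terminal_states stands in for self.reward_signals[k].use_terminal_states):

from typing import Dict

def zero_terminal_values(value_estimates: Dict[str, float],
                         done: bool,
                         uses_terminal_states: Dict[str, bool]) -> Dict[str, float]:
    # Same rule as the end of get_value_estimates(): if the episode ended,
    # signals with use_terminal_states bootstrap from 0 instead of V(s).
    if not done:
        return dict(value_estimates)
    return {k: 0.0 if uses_terminal_states[k] else v
            for k, v in value_estimates.items()}

print(zero_terminal_values({"extrinsic": 0.7, "curiosity": 0.2}, True,
                           {"extrinsic": True, "curiosity": False}))
# {'extrinsic': 0.0, 'curiosity': 0.2}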

ml-agents/mlagents/trainers/ppo/trainer.py (212 changes)


import logging
from collections import defaultdict
from typing import Dict
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput
from mlagents.trainers.action_info import ActionInfoOutputs
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory
logger = logging.getLogger("mlagents.trainers")

self.policy = self.ppo_policy
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
def process_experiences(
self, current_info: BrainInfo, next_info: BrainInfo
) -> None:
def process_trajectory(self, trajectory: Trajectory) -> None:
Checks agent histories for processing condition, and processes them as necessary.
Takes a trajectory and processes it, putting it into the update buffer.
:param current_info: current BrainInfo.
:param next_info: next BrainInfo.
:param trajectory: The Trajectory tuple containing the steps to be processed.
if self.is_training:
self.policy.update_normalization(next_info.vector_observations)
for l in range(len(next_info.agents)):
agent_actions = self.processing_buffer[next_info.agents[l]]["actions"]
if (
next_info.local_done[l]
or len(agent_actions) > self.trainer_parameters["time_horizon"]
) and len(agent_actions) > 0:
agent_id = next_info.agents[l]
if next_info.max_reached[l]:
bootstrapping_info = self.processing_buffer[
agent_id
].last_brain_info
idx = bootstrapping_info.agents.index(agent_id)
else:
bootstrapping_info = next_info
idx = l
value_next = self.ppo_policy.get_value_estimates(
bootstrapping_info,
idx,
next_info.local_done[l] and not next_info.max_reached[l],
)
agent_id = trajectory.agent_id # All the agents should have the same ID
tmp_advantages = []
tmp_returns = []
for name in self.policy.reward_signals:
bootstrap_value = value_next[name]
# Add to episode_steps
self.episode_steps[agent_id] += len(trajectory.steps)
local_rewards = self.processing_buffer[agent_id][
"{}_rewards".format(name)
].get_batch()
local_value_estimates = self.processing_buffer[agent_id][
"{}_value_estimates".format(name)
].get_batch()
local_advantage = get_gae(
rewards=local_rewards,
value_estimates=local_value_estimates,
value_next=bootstrap_value,
gamma=self.policy.reward_signals[name].gamma,
lambd=self.trainer_parameters["lambd"],
)
local_return = local_advantage + local_value_estimates
# This is later use as target for the different value estimates
self.processing_buffer[agent_id]["{}_returns".format(name)].set(
local_return
)
self.processing_buffer[agent_id]["{}_advantage".format(name)].set(
local_advantage
)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)
agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
global_advantages = list(
np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
)
global_returns = list(
np.mean(np.array(tmp_returns, dtype=np.float32), axis=0)
)
self.processing_buffer[agent_id]["advantages"].set(global_advantages)
self.processing_buffer[agent_id]["discounted_returns"].set(
global_returns
)
# Get all value estimates
value_estimates = self.policy.get_batched_value_estimates(
agent_buffer_trajectory
)
for name, v in value_estimates.items():
agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
self.stats_reporter.add_stat(
self.policy.reward_signals[name].value_name, np.mean(v)
)
self.processing_buffer.append_to_update_buffer(
self.update_buffer,
agent_id,
batch_size=None,
training_length=self.policy.sequence_length,
)
value_next = self.policy.get_value_estimates(
trajectory.next_obs,
agent_id,
trajectory.done_reached and not trajectory.max_step_reached,
)
self.processing_buffer[agent_id].reset_agent()
if next_info.local_done[l]:
self.stats["Environment/Episode Length"].append(
self.episode_steps.get(agent_id, 0)
)
self.episode_steps[agent_id] = 0
for name, rewards in self.collected_rewards.items():
if name == "environment":
self.cumulative_returns_since_policy_update.append(
rewards.get(agent_id, 0)
)
self.stats["Environment/Cumulative Reward"].append(
rewards.get(agent_id, 0)
)
self.reward_buffer.appendleft(rewards.get(agent_id, 0))
rewards[agent_id] = 0
else:
self.stats[
self.policy.reward_signals[name].stat_name
].append(rewards.get(agent_id, 0))
rewards[agent_id] = 0
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.policy.reward_signals.items():
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
agent_buffer_trajectory["{}_rewards".format(name)].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
def add_policy_outputs(
self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
) -> None:
"""
Takes the output of the last action and store it into the training buffer.
"""
actions = take_action_outputs["action"]
if self.policy.use_continuous_act:
actions_pre = take_action_outputs["pre_action"]
self.processing_buffer[agent_id]["actions_pre"].append(
actions_pre[agent_idx]
)
a_dist = take_action_outputs["log_probs"]
# value is a dictionary from name of reward to value estimate of the value head
self.processing_buffer[agent_id]["actions"].append(actions[agent_idx])
self.processing_buffer[agent_id]["action_probs"].append(a_dist[agent_idx])
# Compute GAE and returns
tmp_advantages = []
tmp_returns = []
for name in self.policy.reward_signals:
bootstrap_value = value_next[name]
def add_rewards_outputs(
self,
rewards_out: AllRewardsOutput,
values: Dict[str, np.ndarray],
agent_id: str,
agent_idx: int,
agent_next_idx: int,
) -> None:
"""
Takes the value output of the last action and store it into the training buffer.
"""
for name, reward_result in rewards_out.reward_signals.items():
# 0 because we use the scaled reward to train the agent
self.processing_buffer[agent_id]["{}_rewards".format(name)].append(
reward_result.scaled_reward[agent_next_idx]
local_rewards = agent_buffer_trajectory[
"{}_rewards".format(name)
].get_batch()
local_value_estimates = agent_buffer_trajectory[
"{}_value_estimates".format(name)
].get_batch()
local_advantage = get_gae(
rewards=local_rewards,
value_estimates=local_value_estimates,
value_next=bootstrap_value,
gamma=self.policy.reward_signals[name].gamma,
lambd=self.trainer_parameters["lambd"],
self.processing_buffer[agent_id]["{}_value_estimates".format(name)].append(
values[name][agent_idx][0]
)
local_return = local_advantage + local_value_estimates
# This is later used as a target for the different value estimates
agent_buffer_trajectory["{}_returns".format(name)].set(local_return)
agent_buffer_trajectory["{}_advantage".format(name)].set(local_advantage)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)
# Get global advantages
global_advantages = list(
np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length
)
# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
self._update_end_episode_stats(agent_id)
def is_ready_update(self):
"""

batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self.stats[stat].append(np.mean(stat_list))
self.stats_reporter.add_stat(stat, np.mean(stat_list))
self.stats[stat].append(val)
self.stats_reporter.add_stat(stat, val)
self.clear_update_buffer()
self.trainer_metrics.end_policy_update()
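
process_trajectory computes per-reward-signal advantages with get_gae(); only the call sites appear in this excerpt, so here is a minimal NumPy sketch of the standard GAE recursion those arguments imply (the helper actually defined in ppo/trainer.py may differ in detail):

import numpy as np

def discount_rewards(r, gamma):
    # Discounted cumulative sum over a single trajectory, latest step last.
    out = np.zeros_like(r)
    running = 0.0
    for t in reversed(range(len(r))):
        running = r[t] + gamma * running
        out[t] = running
    return out

def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), then discount by gamma * lambd.
    # value_next is the bootstrap value returned by get_value_estimates() above.
    values = np.append(np.asarray(value_estimates, dtype=np.float64), value_next)
    deltas = np.asarray(rewards, dtype=np.float64) + gamma * values[1:] - values[:-1]
    return discount_rewards(deltas, gamma * lambd)

advantages = get_gae(rewards=[0.0, 0.0, 1.0], value_estimates=[0.4, 0.5, 0.6])
returns = advantages + np.array([0.4, 0.5, 0.6])   # the "{}_returns" target above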

ml-agents/mlagents/trainers/rl_trainer.py (246 changes)


# # Unity ML-Agents Toolkit
import logging
from typing import Dict, List, Any, NamedTuple
import numpy as np
from typing import Dict
from collections import defaultdict
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.action_info import ActionInfoOutputs
from mlagents.trainers.agent_processor import ProcessingBuffer
from mlagents.trainers.trainer import Trainer, UnityTrainerException
from mlagents.trainers.components.reward_signals import RewardSignalResult

class AllRewardsOutput(NamedTuple):
"""
This class stores all of the outputs of the reward signals,
as well as the raw reward from the environment.
"""
reward_signals: RewardSignalResults
environment: np.ndarray
class RLTrainer(Trainer):
"""
This class is the base class for trainers that use Reward Signals.

# collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
# used for reporting only. We always want to report the environment reward to Tensorboard, regardless
# of what reward signals are actually present.
self.collected_rewards = {"environment": {}}
self.processing_buffer = ProcessingBuffer()
self.update_buffer = AgentBuffer()
self.episode_steps = {}
def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
"""
Constructs a BrainInfo which contains the most recent previous experiences for all agents
which correspond to the agents in a provided next_info.
:BrainInfo next_info: A t+1 BrainInfo.
:return: curr_info: Reconstructed BrainInfo to match agents of next_info.
"""
visual_observations: List[List[Any]] = [
[] for _ in next_info.visual_observations
] # TODO add types to brain.py methods
vector_observations = []
rewards = []
local_dones = []
max_reacheds = []
agents = []
action_masks = []
for agent_id in next_info.agents:
agent_brain_info = self.processing_buffer[agent_id].last_brain_info
if agent_brain_info is None:
agent_brain_info = next_info
agent_index = agent_brain_info.agents.index(agent_id)
for i in range(len(next_info.visual_observations)):
visual_observations[i].append(
agent_brain_info.visual_observations[i][agent_index]
)
vector_observations.append(
agent_brain_info.vector_observations[agent_index]
)
rewards.append(agent_brain_info.rewards[agent_index])
local_dones.append(agent_brain_info.local_done[agent_index])
max_reacheds.append(agent_brain_info.max_reached[agent_index])
agents.append(agent_brain_info.agents[agent_index])
action_masks.append(agent_brain_info.action_masks[agent_index])
curr_info = BrainInfo(
visual_observations,
vector_observations,
rewards,
agents,
local_dones,
max_reacheds,
action_masks,
)
return curr_info
def add_experiences(
self,
curr_info: BrainInfo,
next_info: BrainInfo,
take_action_outputs: ActionInfoOutputs,
) -> None:
"""
Adds experiences to each agent's experience history.
:param curr_info: current BrainInfo.
:param next_info: next BrainInfo.
:param take_action_outputs: The outputs of the Policy's get_action method.
"""
self.trainer_metrics.start_experience_collection_timer()
if take_action_outputs:
self.stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean())
self.stats["Policy/Learning Rate"].append(
take_action_outputs["learning_rate"]
)
for name, signal in self.policy.reward_signals.items():
self.stats[signal.value_name].append(
np.mean(take_action_outputs["value_heads"][name])
)
for agent_id in curr_info.agents:
self.processing_buffer[agent_id].last_brain_info = curr_info
self.processing_buffer[
agent_id
].last_take_action_outputs = take_action_outputs
if curr_info.agents != next_info.agents:
curr_to_use = self.construct_curr_info(next_info)
else:
curr_to_use = curr_info
# Evaluate and store the reward signals
tmp_reward_signal_outs = {}
for name, signal in self.policy.reward_signals.items():
tmp_reward_signal_outs[name] = signal.evaluate(
curr_to_use, take_action_outputs["action"], next_info
)
# Store the environment reward
tmp_environment = np.array(next_info.rewards, dtype=np.float32)
rewards_out = AllRewardsOutput(
reward_signals=tmp_reward_signal_outs, environment=tmp_environment
)
for agent_id in next_info.agents:
stored_info = self.processing_buffer[agent_id].last_brain_info
stored_take_action_outputs = self.processing_buffer[
agent_id
].last_take_action_outputs
if stored_info is not None:
idx = stored_info.agents.index(agent_id)
next_idx = next_info.agents.index(agent_id)
if not stored_info.local_done[idx]:
for i, _ in enumerate(stored_info.visual_observations):
self.processing_buffer[agent_id]["visual_obs%d" % i].append(
stored_info.visual_observations[i][idx]
)
self.processing_buffer[agent_id][
"next_visual_obs%d" % i
].append(next_info.visual_observations[i][next_idx])
if self.policy.use_vec_obs:
self.processing_buffer[agent_id]["vector_obs"].append(
stored_info.vector_observations[idx]
)
self.processing_buffer[agent_id]["next_vector_in"].append(
next_info.vector_observations[next_idx]
)
if self.policy.use_recurrent:
self.processing_buffer[agent_id]["memory"].append(
self.policy.retrieve_memories([agent_id])[0, :]
)
self.processing_buffer[agent_id]["masks"].append(1.0)
self.processing_buffer[agent_id]["done"].append(
next_info.local_done[next_idx]
)
# Add the outputs of the last eval
self.add_policy_outputs(stored_take_action_outputs, agent_id, idx)
# Store action masks if necessary
if not self.policy.use_continuous_act:
self.processing_buffer[agent_id]["action_mask"].append(
stored_info.action_masks[idx], padding_value=1
)
self.processing_buffer[agent_id]["prev_action"].append(
self.policy.retrieve_previous_action([agent_id])[0, :]
)
values = stored_take_action_outputs["value_heads"]
# Add the value outputs if needed
self.add_rewards_outputs(
rewards_out, values, agent_id, idx, next_idx
)
for name, rewards in self.collected_rewards.items():
if agent_id not in rewards:
rewards[agent_id] = 0
if name == "environment":
# Report the reward from the environment
rewards[agent_id] += rewards_out.environment[next_idx]
else:
# Report the reward signals
rewards[agent_id] += rewards_out.reward_signals[
name
].scaled_reward[next_idx]
if not next_info.local_done[next_idx]:
if agent_id not in self.episode_steps:
self.episode_steps[agent_id] = 0
self.episode_steps[agent_id] += 1
self.policy.save_previous_action(
curr_info.agents, take_action_outputs["action"]
)
self.trainer_metrics.end_experience_collection_timer()
self.collected_rewards: Dict[str, Dict[str, int]] = {
"environment": defaultdict(lambda: 0)
}
self.update_buffer: AgentBuffer = AgentBuffer()
self.episode_steps: Dict[str, int] = defaultdict(lambda: 0)
def end_episode(self) -> None:
"""

self.processing_buffer.reset_local_buffers()
for agent_id in self.episode_steps:
self.episode_steps[agent_id] = 0
for rewards in self.collected_rewards.values():

def _update_end_episode_stats(self, agent_id: str) -> None:
self.episode_steps[agent_id] = 0
for name, rewards in self.collected_rewards.items():
if name == "environment":
self.cumulative_returns_since_policy_update.append(
rewards.get(agent_id, 0)
)
self.reward_buffer.appendleft(rewards.get(agent_id, 0))
rewards[agent_id] = 0
else:
self.stats_reporter.add_stat(
self.policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
)
rewards[agent_id] = 0
def clear_update_buffer(self) -> None:
"""
Clear the buffers that have been built up during inference. If

def add_policy_outputs(
self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
) -> None:
"""
Takes the output of the last action and store it into the training buffer.
We break this out from add_experiences since it is very highly dependent
on the type of trainer.
:param take_action_outputs: The outputs of the Policy's get_action method.
:param agent_id: the Agent we're adding to.
:param agent_idx: the index of the Agent agent_id
"""
raise UnityTrainerException(
"The add_policy_outputs method was not implemented."
)
def add_rewards_outputs(
self,
rewards_out: AllRewardsOutput,
values: Dict[str, np.ndarray],
agent_id: str,
agent_idx: int,
agent_next_idx: int,
) -> None:
"""
Takes the value and evaluated rewards output of the last action and store it
into the training buffer. We break this out from add_experiences since it is very
highly dependent on the type of trainer.
:param take_action_outputs: The outputs of the Policy's get_action method.
:param rewards_dict: Dict of rewards after evaluation
:param agent_id: the Agent we're adding to.
:param agent_idx: the index of the Agent agent_id in the current brain info
:param agent_next_idx: the index of the Agent agent_id in the next brain info
"""
raise UnityTrainerException(
"The add_rewards_outputs method was not implemented."
)
def advance(self):
"""
Eventually logic from TrainerController.advance() will live here.
"""
self.clear_update_buffer()
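
trajectory.py itself (128 changed lines, listed above) is not excerpted on this page, but the fields used throughout the new code pin down its shape fairly well. A hedged reconstruction of the two data types, inferred only from the usages in this diff; the real definitions may differ, and to_agentbuffer() is omitted here:

from typing import List, NamedTuple, Optional
import numpy as np

class AgentExperience(NamedTuple):
    # Field names come from AgentProcessor.add_experiences() above; the
    # concrete types are assumptions.
    obs: List[np.ndarray]
    reward: float
    done: bool
    action: np.ndarray
    action_probs: np.ndarray
    action_pre: Optional[np.ndarray]
    action_mask: Optional[np.ndarray]
    prev_action: np.ndarray
    max_step: bool
    memory: Optional[np.ndarray]

class Trajectory(NamedTuple):
    steps: List[AgentExperience]
    agent_id: str
    next_obs: List[np.ndarray]          # observations after the final step

    @property
    def done_reached(self) -> bool:
        return self.steps[-1].done

    @property
    def max_step_reached(self) -> bool:
        return self.steps[-1].max_step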

ml-agents/mlagents/trainers/sac/policy.py (2 changes)


{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value_heads": self.model.value_heads,
"value": self.model.value,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}

ml-agents/mlagents/trainers/sac/trainer.py (140 changes)


import numpy as np
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.action_info import ActionInfoOutputs
from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
LOGGER = logging.getLogger("mlagents.trainers")

)
for _reward_signal in self.policy.reward_signals.keys():
self.collected_rewards[_reward_signal] = {}
self.episode_steps = {}
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
def save_model(self) -> None:
"""

)
)
def add_policy_outputs(
self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
) -> None:
def process_trajectory(self, trajectory: Trajectory) -> None:
Takes the output of the last action and store it into the training buffer.
Takes a trajectory and processes it, putting it into the replay buffer.
actions = take_action_outputs["action"]
self.processing_buffer[agent_id]["actions"].append(actions[agent_idx])
last_step = trajectory.steps[-1]
agent_id = trajectory.agent_id # All the agents should have the same ID
def add_rewards_outputs(
self,
rewards_out: AllRewardsOutput,
values: Dict[str, np.ndarray],
agent_id: str,
agent_idx: int,
agent_next_idx: int,
) -> None:
"""
Takes the value output of the last action and store it into the training buffer.
"""
self.processing_buffer[agent_id]["environment_rewards"].append(
rewards_out.environment[agent_next_idx]
)
# Add to episode_steps
self.episode_steps[agent_id] += len(trajectory.steps)
def process_experiences(
self, current_info: BrainInfo, next_info: BrainInfo
) -> None:
"""
Checks agent histories for processing condition, and processes them as necessary.
:param current_info: current BrainInfo.
:param next_info: next BrainInfo.
"""
agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
self.policy.update_normalization(next_info.vector_observations)
for l in range(len(next_info.agents)):
agent_actions = self.processing_buffer[next_info.agents[l]]["actions"]
if (
next_info.local_done[l]
or len(agent_actions) >= self.trainer_parameters["time_horizon"]
) and len(agent_actions) > 0:
agent_id = next_info.agents[l]
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
# Bootstrap using last brain info. Set last element to duplicate obs and remove dones.
if next_info.max_reached[l]:
bootstrapping_info = self.processing_buffer[
agent_id
].last_brain_info
idx = bootstrapping_info.agents.index(agent_id)
for i, obs in enumerate(bootstrapping_info.visual_observations):
self.processing_buffer[agent_id]["next_visual_obs%d" % i][
-1
] = obs[idx]
if self.policy.use_vec_obs:
self.processing_buffer[agent_id]["next_vector_in"][
-1
] = bootstrapping_info.vector_observations[idx]
self.processing_buffer[agent_id]["done"][-1] = False
# Evaluate all reward functions for reporting purposes
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.policy.reward_signals.items():
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
self.processing_buffer.append_to_update_buffer(
self.update_buffer,
agent_id,
batch_size=None,
training_length=self.policy.sequence_length,
)
# Get all value estimates for reporting purposes
value_estimates = self.policy.get_batched_value_estimates(
agent_buffer_trajectory
)
for name, v in value_estimates.items():
self.stats_reporter.add_stat(
self.policy.reward_signals[name].value_name, np.mean(v)
)
self.processing_buffer[agent_id].reset_agent()
if next_info.local_done[l]:
self.stats["Environment/Episode Length"].append(
self.episode_steps.get(agent_id, 0)
)
self.episode_steps[agent_id] = 0
for name, rewards in self.collected_rewards.items():
if name == "environment":
self.cumulative_returns_since_policy_update.append(
rewards.get(agent_id, 0)
)
self.stats["Environment/Cumulative Reward"].append(
rewards.get(agent_id, 0)
)
self.reward_buffer.appendleft(rewards.get(agent_id, 0))
rewards[agent_id] = 0
else:
self.stats[
self.policy.reward_signals[name].stat_name
].append(rewards.get(agent_id, 0))
rewards[agent_id] = 0
# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
if last_step.max_step:
vec_vis_obs = SplitObservations.from_observations(last_step.obs)
for i, obs in enumerate(vec_vis_obs.visual_observations):
agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
if vec_vis_obs.vector_observations.size > 1:
agent_buffer_trajectory["next_vector_in"][
-1
] = vec_vis_obs.vector_observations
agent_buffer_trajectory["done"][-1] = False
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length
)
if trajectory.done_reached:
self._update_end_episode_stats(agent_id)
def is_ready_update(self) -> bool:
"""

)
for stat, stat_list in batch_update_stats.items():
self.stats[stat].append(np.mean(stat_list))
self.stats_reporter.add_stat(stat, np.mean(stat_list))
self.stats[stat].append(val)
self.stats_reporter.add_stat(stat, val)
def update_reward_signals(self) -> None:
"""

for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self.stats[stat].append(np.mean(stat_list))
self.stats_reporter.add_stat(stat, np.mean(stat_list))
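
The SAC bootstrap step relies on SplitObservations.from_observations() to split the last step's observation list back into visual and vector parts. Since trajectory.py is not excerpted here, this is a hedged sketch of what that helper plausibly does; the "rank 3 means visual" rule is an assumption:

from typing import List, NamedTuple
import numpy as np

class SplitObservations(NamedTuple):
    vector_observations: np.ndarray
    visual_observations: List[np.ndarray]

    @staticmethod
    def from_observations(obs: List[np.ndarray]) -> "SplitObservations":
        # Treat rank-3 arrays (H x W x C) as visual observations and
        # concatenate the rank-1 arrays into a single vector observation.
        vis = [o for o in obs if o.ndim == 3]
        vec = [o for o in obs if o.ndim == 1]
        vector = (np.concatenate(vec, axis=0) if vec
                  else np.zeros(0, dtype=np.float32))
        return SplitObservations(vector_observations=vector,
                                 visual_observations=vis)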

ml-agents/mlagents/trainers/tests/mock_brain.py (45 changes)


from mlagents.trainers.brain import CameraResolution, BrainParameters
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
def create_mock_brainparams(

def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
buffer = ProcessingBuffer()
buffer = AgentBuffer()
update_buffer = AgentBuffer()
# Make a buffer
for idx, experience in enumerate(brain_infos):

next_brain_info = brain_infos[idx + 1]
buffer[0].last_brain_info = current_brain_info
buffer[0]["done"].append(next_brain_info.local_done[0])
buffer[0]["rewards"].append(next_brain_info.rewards[0])
buffer.last_brain_info = current_brain_info
buffer["done"].append(next_brain_info.local_done[0])
buffer["rewards"].append(next_brain_info.rewards[0])
buffer[0]["visual_obs%d" % i].append(
buffer["visual_obs%d" % i].append(
buffer[0]["next_visual_obs%d" % i].append(
buffer["next_visual_obs%d" % i].append(
buffer[0]["vector_obs"].append(current_brain_info.vector_observations[0])
buffer[0]["next_vector_in"].append(
current_brain_info.vector_observations[0]
)
buffer["vector_obs"].append(current_brain_info.vector_observations[0])
buffer["next_vector_in"].append(current_brain_info.vector_observations[0])
buffer[0]["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer[0]["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer[0]["masks"].append(1.0)
buffer[0]["advantages"].append(1.0)
buffer["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer["masks"].append(1.0)
buffer["advantages"].append(1.0)
buffer[0]["action_probs"].append(
buffer["action_probs"].append(
buffer[0]["action_probs"].append(
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
buffer["action_probs"].append(
np.ones(buffer["actions"][0].shape, dtype=np.float32)
buffer[0]["actions_pre"].append(
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
buffer["actions_pre"].append(
np.ones(buffer["actions"][0].shape, dtype=np.float32)
buffer[0]["action_mask"].append(
buffer["action_mask"].append(
buffer[0]["memory"].append(np.ones(memory_size, dtype=np.float32))
buffer["memory"].append(np.ones(memory_size, dtype=np.float32))
buffer.append_to_update_buffer(
update_buffer, 0, batch_size=None, training_length=sequence_length
buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=sequence_length
)
return update_buffer

ml-agents/mlagents/trainers/tests/test_buffer.py (95 changes)


import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
def assert_array(a, b):

assert la[i] == lb[i]
def construct_fake_processing_buffer():
b = ProcessingBuffer()
for fake_agent_id in range(4):
for step in range(9):
b[fake_agent_id]["vector_observation"].append(
[
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,
100 * fake_agent_id + 10 * step + 3,
]
)
b[fake_agent_id]["action"].append(
[
100 * fake_agent_id + 10 * step + 4,
100 * fake_agent_id + 10 * step + 5,
]
)
def construct_fake_buffer(fake_agent_id):
b = AgentBuffer()
for step in range(9):
b["vector_observation"].append(
[
100 * fake_agent_id + 10 * step + 1,
100 * fake_agent_id + 10 * step + 2,
100 * fake_agent_id + 10 * step + 3,
]
)
b["action"].append(
[100 * fake_agent_id + 10 * step + 4, 100 * fake_agent_id + 10 * step + 5]
)
b = construct_fake_processing_buffer()
a = b[1]["vector_observation"].get_batch(
agent_1_buffer = construct_fake_buffer(1)
agent_2_buffer = construct_fake_buffer(2)
agent_3_buffer = construct_fake_buffer(3)
a = agent_1_buffer["vector_observation"].get_batch(
a = b[2]["vector_observation"].get_batch(
a = agent_2_buffer["vector_observation"].get_batch(
batch_size=2, training_length=3, sequential=True
)
assert_array(

]
),
)
a = b[2]["vector_observation"].get_batch(
a = agent_2_buffer["vector_observation"].get_batch(
batch_size=2, training_length=3, sequential=False
)
assert_array(

]
),
)
b[4].reset_agent()
assert len(b[4]) == 0
agent_1_buffer.reset_agent()
assert agent_1_buffer.num_experiences == 0
b.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
agent_2_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2
)
agent_3_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2
)
assert len(update_buffer["action"]) == 20
assert np.array(update_buffer["action"]).shape == (20, 2)

def test_buffer_sample():
b = construct_fake_processing_buffer()
agent_1_buffer = construct_fake_buffer(1)
agent_2_buffer = construct_fake_buffer(2)
b.append_to_update_buffer(update_buffer, 3, batch_size=None, training_length=2)
b.append_to_update_buffer(update_buffer, 2, batch_size=None, training_length=2)
agent_1_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2
)
agent_2_buffer.resequence_and_append(
update_buffer, batch_size=None, training_length=2
)
# Test non-LSTM
mb = update_buffer.sample_mini_batch(batch_size=4, sequence_length=1)
assert mb.keys() == update_buffer.keys()