
Merge latest develop

/hotfix-v0.9.2a
Ervin Teng, 5 years ago
Current commit: 072d2ef8
36 files changed, with 2,077 additions and 1,345 deletions
  1. ml-agents-envs/mlagents/envs/subprocess_env_manager.py (1)
  2. ml-agents-envs/mlagents/envs/tests/test_timers.py (7)
  3. ml-agents-envs/mlagents/envs/timers.py (63)
  4. ml-agents/mlagents/trainers/bc/models.py (2)
  5. ml-agents/mlagents/trainers/bc/offline_trainer.py (11)
  6. ml-agents/mlagents/trainers/bc/online_trainer.py (11)
  7. ml-agents/mlagents/trainers/bc/trainer.py (31)
  8. ml-agents/mlagents/trainers/buffer.py (171)
  9. ml-agents/mlagents/trainers/components/bc/module.py (38)
  10. ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (68)
  11. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (5)
  12. ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (6)
  13. ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (62)
  14. ml-agents/mlagents/trainers/learn.py (55)
  15. ml-agents/mlagents/trainers/models.py (258)
  16. ml-agents/mlagents/trainers/ppo/models.py (241)
  17. ml-agents/mlagents/trainers/ppo/policy.py (144)
  18. ml-agents/mlagents/trainers/ppo/trainer.py (334)
  19. ml-agents/mlagents/trainers/tests/mock_brain.py (14)
  20. ml-agents/mlagents/trainers/tests/test_buffer.py (71)
  21. ml-agents/mlagents/trainers/tests/test_learn.py (9)
  22. ml-agents/mlagents/trainers/tests/test_ppo.py (2)
  23. ml-agents/mlagents/trainers/tests/test_reward_signals.py (15)
  24. ml-agents/mlagents/trainers/tests/test_trainer_controller.py (289)
  25. ml-agents/mlagents/trainers/tf_policy.py (5)
  26. ml-agents/mlagents/trainers/trainer.py (173)
  27. ml-agents/mlagents/trainers/trainer_controller.py (113)
  28. ml-agents/setup.py (1)
  29. ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py (140)
  30. ml-agents/mlagents/trainers/rl_trainer.py (253)
  31. ml-agents/mlagents/trainers/tests/test_multigpu.py (129)
  32. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (81)
  33. ml-agents/mlagents/trainers/tests/test_simple_rl.py (207)
  34. ml-agents/mlagents/trainers/tests/test_trainer_util.py (315)
  35. ml-agents/mlagents/trainers/trainer_util.py (97)

ml-agents-envs/mlagents/envs/subprocess_env_manager.py (1 change)


# So after we send back the root timer, we can safely clear them.
# Note that we could randomly return timers a fraction of the time if we wanted to reduce
# the data transferred.
# TODO get gauges from the workers and merge them in the main process too.
step_response = StepResponse(all_brain_info, get_timer_root())
step_queue.put(EnvironmentResponse("step", worker_id, step_response))
reset_timers()
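
For reference, a minimal sketch of the capture-then-reset pattern the comment above describes, using only the helpers that appear in this diff (hierarchical_timer, get_timer_root, reset_timers from mlagents.envs.timers):

from mlagents.envs.timers import hierarchical_timer, get_timer_root, reset_timers

with hierarchical_timer("env_step"):
    pass  # ... advance the environment ...

snapshot = get_timer_root()  # worker-local timer tree to send back to the main process
reset_timers()               # clear it so the same spans are not reported twice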

ml-agents-envs/mlagents/envs/tests/test_timers.py (7 changes)


@timers.timed
def decorated_func(x: int = 0, y: float = 1.0) -> str:
    timers.set_gauge("my_gauge", x + y)
    return f"{x} + {y} = {x + y}"

with timers.hierarchical_timer("top_level"):
    for i in range(3):
        with timers.hierarchical_timer("multiple"):
            decorated_func()
            decorated_func(i, i)
raised = False
try:

            ],
        }
    ],
    "gauges": [
        {"name": "my_gauge", "value": 4.0, "max": 4.0, "min": 0.0, "count": 3}
    ],
}
assert timer_tree == expected_tree

ml-agents-envs/mlagents/envs/timers.py (63 changes)


# Unity ML-Agents Toolkit
import math
from typing import Any, Callable, Dict, Generator, TypeVar
from typing import Any, Callable, Dict, Generator, List, TypeVar
"""
Lightweight, hierarchical timers for profiling sections of code.

child.merge(other_child_node, is_parallel=is_parallel)
class GaugeNode:
    """
    Tracks the most recent value of a metric. This is analogous to gauges in statsd.
    """

    __slots__ = ["value", "min_value", "max_value", "count"]

    def __init__(self, value: float):
        self.value = value
        self.min_value = value
        self.max_value = value
        self.count = 1

    def update(self, new_value: float):
        self.min_value = min(self.min_value, new_value)
        self.max_value = max(self.max_value, new_value)
        self.value = new_value
        self.count += 1

    def as_dict(self) -> Dict[str, float]:
        return {
            "value": self.value,
            "min": self.min_value,
            "max": self.max_value,
            "count": self.count,
        }
class TimerStack:
"""
Tracks all the time spent. Users shouldn't use this directly, they should use the contextmanager below to make

__slots__ = ["root", "stack", "start_time"]
__slots__ = ["root", "stack", "start_time", "gauges"]
self.gauges: Dict[str, GaugeNode] = {}
def push(self, name: str) -> TimerNode:
"""

node = self.get_root()
res["name"] = "root"
# Only output gauges at top level
if self.gauges:
res["gauges"] = self._get_gauges()
res["total"] = node.total
res["count"] = node.count

return res
    def set_gauge(self, name: str, value: float) -> None:
        if math.isnan(value):
            return
        gauge_node = self.gauges.get(name)
        if gauge_node:
            gauge_node.update(value)
        else:
            self.gauges[name] = GaugeNode(value)

    def _get_gauges(self) -> List[Dict[str, Any]]:
        gauges = []
        for gauge_name, gauge_node in self.gauges.items():
            gauge_dict: Dict[str, Any] = {"name": gauge_name, **gauge_node.as_dict()}
            gauges.append(gauge_dict)
        return gauges
# Global instance of a TimerStack. This is generally all that we need for profiling, but you can potentially
# create multiple instances and pass them to the contextmanager

return func(*args, **kwargs)
return wrapped # type: ignore
def set_gauge(name: str, value: float, timer_stack: TimerStack = None) -> None:
    """
    Updates the value of the gauge (or creates it if it hasn't been set before).
    """
    timer_stack = timer_stack or _global_timer_stack
    timer_stack.set_gauge(name, value)
def get_timer_tree(timer_stack: TimerStack = None) -> Dict[str, Any]:
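
Taken together, a short usage sketch for the gauge API added in this file (the reported values follow GaugeNode.update above):

from mlagents.envs.timers import hierarchical_timer, set_gauge, get_timer_tree

with hierarchical_timer("update"):
    set_gauge("loss", 0.25)  # first call creates the GaugeNode (count=1)
    set_gauge("loss", 0.75)  # updates value/min/max and increments count

tree = get_timer_tree()
# tree["gauges"] == [{"name": "loss", "value": 0.75, "min": 0.25, "max": 0.75, "count": 2}]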

ml-agents/mlagents/trainers/bc/models.py (2 changes)


self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
self.sample_action_float, normalized_logits = self.create_discrete_action_masking_layer(
self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer(
tf.concat(policy_branches, axis=1), self.action_masks, self.act_size
)
tf.identity(normalized_logits, name="action")

ml-agents/mlagents/trainers/bc/offline_trainer.py (11 changes)


"The provided demonstration is not compatible with the "
"brain being used for performance evaluation."
)
def __str__(self):
return """Hyperparameters for the Imitation Trainer of brain {0}: \n{1}""".format(
self.brain_name,
"\n".join(
[
"\t{0}:\t{1}".format(x, self.trainer_parameters[x])
for x in self.param_keys
]
),
)

ml-agents/mlagents/trainers/bc/online_trainer.py (11 changes)


int(trainer_parameters["batch_size"] / self.policy.sequence_length), 1
)
def __str__(self):
return """Hyperparameters for the Imitation Trainer of brain {0}: \n{1}""".format(
self.brain_name,
"\n".join(
[
"\t{0}:\t{1}".format(x, self.trainer_parameters[x])
for x in self.param_keys
]
),
)
def add_experiences(
self,
curr_info: AllBrainInfo,

ml-agents/mlagents/trainers/bc/trainer.py (31 changes)


self.demonstration_buffer = Buffer()
self.evaluation_buffer = Buffer()
@property
def parameters(self):
"""
Returns the trainer parameters of the trainer.
"""
return self.trainer_parameters
@property
def get_max_steps(self):
"""
Returns the maximum number of steps. Is used to know when the trainer should be stopped.
:return: The maximum number of steps of the trainer
"""
return float(self.trainer_parameters["max_steps"])
@property
def get_step(self):
"""
Returns the number of steps the trainer has performed
:return: the step count of the trainer
"""
return self.policy.get_current_step()
def increment_step(self, n_steps: int) -> None:
"""
Increment the step count of the trainer
:param n_steps: number of steps to increment the step count by
"""
self.step = self.policy.increment_step(n_steps)
def add_experiences(
self,
curr_info: AllBrainInfo,

ml-agents/mlagents/trainers/buffer.py (171 changes)


import random
from collections import defaultdict
import h5py
from mlagents.envs.exception import UnityException

sequential=True gives [[0, a], [b, c], [d, e]]; sequential=False gives
[[a, b], [b, c], [c, d], [d, e]].
"""
if training_length == 1:
# When the training length is 1, the method returns a list of elements,
# not a list of sequences of elements.
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
# If batch_size is None : All the elements of the AgentBufferField are returned.
return np.array(self)
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (
len(self) // training_length + 1 * (leftover != 0)
):
raise BufferException(
    "The batch size and training length requested for get_batch were"
    " too large given the current number of data points."
)
if batch_size * training_length > len(self):
padding = np.array(self[-1]) * self.padding_value
return np.array(
[padding] * (training_length - leftover) + self[:]
)
# return the batch_size last elements
if batch_size > len(self):
raise BufferException("Batch size requested is too large")
return np.array(self[-batch_size:])
return np.array(
self[len(self) - batch_size * training_length :]
)
# The training_length is greater than 1: the method returns a list of SEQUENCES of elements
if not sequential:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
    "The batch size and training length requested for get_batch were"
    " too large given the current number of data points."
)
tmp_list = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += [np.array(self[end - training_length : end])]
return np.array(tmp_list)
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (
leftover != 0
)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (
len(self) // training_length + 1 * (leftover != 0)
):
raise BufferException(
    "The batch size and training length requested for get_batch were"
    " too large given the current number of data points."
)
tmp_list = []
padding = np.array(self[-1]) * self.padding_value
# The padding is made with zeros and its shape is given by the shape of the last element
for end in range(
len(self), len(self) % training_length, -training_length
)[:batch_size]:
tmp_list += [np.array(self[end - training_length : end])]
if (leftover != 0) and (len(tmp_list) < batch_size):
tmp_list += [
np.array(
[padding] * (training_length - leftover)
+ self[:leftover]
)
]
tmp_list.reverse()
return np.array(tmp_list)
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException(
    "The batch size and training length requested for get_batch were"
    " too large given the current number of data points."
)
tmp_list = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += self[end - training_length : end]
return np.array(tmp_list)
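
A worked example of the two modes on a five-element field, assuming the Buffer API excerpted here (the agent id, field name, and values are hypothetical; the padding element is 0 by default):

from mlagents.trainers.buffer import Buffer

b = Buffer()
b[0]["obs"].extend([1.0, 2.0, 3.0, 4.0, 5.0])  # agent 0, hypothetical field "obs"

b[0]["obs"].get_batch(batch_size=None, training_length=2, sequential=True)
# -> [[0., 1.], [2., 3.], [4., 5.]]  (non-overlapping; first sequence left-padded)

b[0]["obs"].get_batch(batch_size=None, training_length=2, sequential=False)
# -> [[1., 2.], [2., 3.], [3., 4.], [4., 5.]]  (overlapping sequences)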
def reset_field(self):
"""

length = len(self[key])
return True
def shuffle(self, key_list=None):
def shuffle(self, sequence_length, key_list=None):
Shuffles the fields in key_list in a consistent way: The reordering will
be the same across fields.
:param key_list: The fields that must be shuffled.

raise BufferException(
"Unable to shuffle if the fields are not of same length"
)
s = np.arange(len(self[key_list[0]]))
s = np.arange(len(self[key_list[0]]) // sequence_length)
self[key][:] = [self[key][i] for i in s]
tmp = []
for i in s:
tmp += self[key][i * sequence_length : (i + 1) * sequence_length]
self[key][:] = tmp
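
The new sequence_length argument makes the shuffle permute whole sequences instead of single entries, so recurrent training sequences stay contiguous. A sketch of the effect (hypothetical field and values):

b.update_buffer["actions"].extend([1, 2, 3, 4, 5, 6])
b.update_buffer.shuffle(sequence_length=2)
# the blocks [1, 2], [3, 4], [5, 6] are permuted as units,
# e.g. a possible result is [5, 6, 1, 2, 3, 4];
# sequence_length=1 reproduces the old per-element behavior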
def make_mini_batch(self, start, end):
"""

"""
mini_batch = {}
for key in self:
mini_batch[key] = np.array(self[key][start:end])
mini_batch[key] = self[key][start:end]
return mini_batch
def sample_mini_batch(self, batch_size, sequence_length=1):
"""
Creates a mini-batch from a random start and end.
:param batch_size: number of elements to withdraw.
:param sequence_length: Length of sequences to sample.
Number of sequences to sample will be batch_size/sequence_length.
"""
num_seq_to_sample = batch_size // sequence_length
mini_batch = Buffer.AgentBuffer()
buff_len = len(next(iter(self.values())))
num_sequences_in_buffer = buff_len // sequence_length
start_idxes = [
random.randint(0, num_sequences_in_buffer - 1) * sequence_length
for _ in range(num_seq_to_sample)
] # Sample random sequence starts
for i in start_idxes:
for key in self:
mini_batch[key].extend(self[key][i : i + sequence_length])
return mini_batch
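
For example (hypothetical sizes): with batch_size=4, sequence_length=2, and a 6-entry buffer, num_seq_to_sample = 2 and sequence starts are drawn from {0, 2, 4}:

# buff_len = 6 -> num_sequences_in_buffer = 3
# start_idxes might be [4, 0], giving a mini-batch of
# self[key][4:6] followed by self[key][0:2] for every field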
def save_to_file(self, file_object):
"""
Saves the AgentBuffer to a file-like object.
"""
with h5py.File(file_object) as write_file:
for key, data in self.items():
write_file.create_dataset(
key, data=data, dtype="f", compression="gzip"
)
def load_from_file(self, file_object):
"""
Loads the AgentBuffer from a file-like object.
"""
with h5py.File(file_object) as read_file:
for key in list(read_file.keys()):
self[key] = Buffer.AgentBuffer.AgentBufferField()
# extend() will convert the numpy array's first dimension into list
self[key].extend(read_file[key][()])
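
A round-trip sketch for these two methods (requires an h5py version that accepts file-like objects; the field name is hypothetical):

import io
from mlagents.trainers.buffer import Buffer

src = Buffer.AgentBuffer()
src["rewards"].extend([0.0, 1.0, 0.5])

f = io.BytesIO()
src.save_to_file(f)
f.seek(0)

dst = Buffer.AgentBuffer()
dst.load_from_file(f)
# dst["rewards"] now holds the same three values as src["rewards"]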
def __init__(self):
self.update_buffer = self.AgentBuffer()
super(Buffer, self).__init__()

Resets the update buffer
"""
self.update_buffer.reset_agent()
def truncate_update_buffer(self, max_length, sequence_length=1):
"""
Truncates the update buffer to a certain length.
This can be slow for large buffers. We compensate by cutting further than we need to, so that
we're not truncating at each update. Note that we must truncate an integer number of sequence_lengths
:param max_length: The length at which to truncate the buffer.
"""
current_length = len(next(iter(self.update_buffer.values())))
# make max_length an integer number of sequence_lengths
max_length -= max_length % sequence_length
if current_length > max_length:
for _key in self.update_buffer.keys():
self.update_buffer[_key] = self.update_buffer[_key][
current_length - max_length :
]
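
Illustrative arithmetic for the truncation above (hypothetical sizes):

# current_length = 10, max_length = 5, sequence_length = 3
# max_length -= max_length % sequence_length  -> 3 (a whole number of sequences)
# each field becomes self.update_buffer[_key][10 - 3:], i.e. the last 3 entries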
def reset_local_buffers(self):
"""

ml-agents/mlagents/trainers/components/bc/module.py (38 changes)


n_epoch = self.num_epoch
for _ in range(n_epoch):
self.demonstration_buffer.update_buffer.shuffle()
self.demonstration_buffer.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
for i in range(num_batches):
for i in range(num_batches // self.policy.sequence_length):
start = i * self.n_sequences
end = (i + 1) * self.n_sequences
start = i * self.n_sequences * self.policy.sequence_length
end = (i + 1) * self.n_sequences * self.policy.sequence_length
mini_batch_demo = demo_update_buffer.make_mini_batch(start, end)
run_out = self._update_batch(mini_batch_demo, self.n_sequences)
loss = run_out["loss"]
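
Illustrative arithmetic for the re-indexed loop above (hypothetical sizes): with n_sequences = 8 and sequence_length = 4, each mini-batch now spans whole sequences rather than single rows:

# i = 0 -> start = 0,  end = 32   (8 sequences x 4 steps each)
# i = 1 -> start = 32, end = 64
# the number of batches shrinks by a factor of sequence_length to compensate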

self.policy.model.batch_size: n_sequences,
self.policy.model.sequence_length: self.policy.sequence_length,
}
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"]
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, self.policy.model.brain.vector_action_space_size[0]]
)
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, len(self.policy.model.brain.vector_action_space_size)]
)
feed_dict[self.policy.model.action_masks] = np.ones(
(
self.n_sequences,

if self.policy.model.brain.vector_observation_space_size > 0:
apparent_obs_size = (
self.policy.model.brain.vector_observation_space_size
* self.policy.model.brain.num_stacked_vector_observations
)
feed_dict[self.policy.model.vector_in] = mini_batch_demo[
"vector_obs"
].reshape([-1, apparent_obs_size])
feed_dict[self.policy.model.vector_in] = mini_batch_demo["vector_obs"]
visual_obs = mini_batch_demo["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = visual_obs.shape
feed_dict[self.policy.model.visual_in[i]] = visual_obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.policy.model.visual_in[i]] = visual_obs
feed_dict[self.policy.model.visual_in[i]] = mini_batch_demo[
"visual_obs%d" % i
]
if self.use_recurrent:
feed_dict[self.policy.model.memory_in] = np.zeros(
[self.n_sequences, self.policy.m_size]

"prev_action"
].reshape([-1, len(self.policy.model.act_size)])
]
network_out = self.policy.sess.run(
list(self.out_dict.values()), feed_dict=feed_dict

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py (68 changes)


feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
if self.policy.use_vec_obs:
feed_dict[self.model.next_vector_in] = next_info.vector_observations
if self.policy.use_recurrent:
if current_info.memories.shape[1] == 0:
current_info.memories = self.policy.make_empty_memory(
len(current_info.agents)
)
feed_dict[self.policy.model.memory_in] = current_info.memories
unscaled_reward = self.policy.sess.run(
self.model.intrinsic_reward, feed_dict=feed_dict
)

forward_total: List[float] = []
inverse_total: List[float] = []
for _ in range(self.num_epoch):
update_buffer.shuffle()
update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = update_buffer
for l in range(len(update_buffer["actions"]) // num_sequences):
start = l * num_sequences

feed_dict = {
self.policy.model.batch_size: num_sequences,
self.policy.model.sequence_length: self.policy.sequence_length,
self.policy.model.mask_input: mini_batch["masks"].flatten(),
self.policy.model.advantage: mini_batch["advantages"].reshape([-1, 1]),
self.policy.model.all_old_log_probs: mini_batch["action_probs"].reshape(
[-1, sum(self.policy.model.act_size)]
),
self.policy.model.mask_input: mini_batch["masks"],
self.policy.model.advantage: mini_batch["advantages"],
self.policy.model.all_old_log_probs: mini_batch["action_probs"],
feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"].reshape(
[-1, self.policy.model.act_size[0]]
)
feed_dict[self.policy.model.epsilon] = mini_batch[
"random_normal_epsilon"
].reshape([-1, self.policy.model.act_size[0]])
feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"]
feed_dict[self.policy.model.action_holder] = mini_batch["actions"].reshape(
[-1, len(self.policy.model.act_size)]
)
if self.policy.use_recurrent:
feed_dict[self.policy.model.prev_action] = mini_batch[
"prev_action"
].reshape([-1, len(self.policy.model.act_size)])
feed_dict[self.policy.model.action_masks] = mini_batch[
"action_mask"
].reshape([-1, sum(self.policy.brain.vector_action_space_size)])
feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"].reshape(
[-1, self.policy.vec_obs_size]
)
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"].reshape(
[-1, self.policy.vec_obs_size]
)
feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
_obs = mini_batch["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.policy.model.visual_in[i]] = _obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.policy.model.visual_in[i]] = _obs
feed_dict[self.policy.model.visual_in[i]] = mini_batch[
"visual_obs%d" % i
]
_obs = mini_batch["next_visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = _obs.shape
feed_dict[self.model.next_visual_in[i]] = _obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.model.next_visual_in[i]] = _obs
if self.policy.use_recurrent:
mem_in = mini_batch["memory"][:, 0, :]
feed_dict[self.policy.model.memory_in] = mem_in
feed_dict[self.model.next_visual_in[i]] = mini_batch[
"next_visual_obs%d" % i
]
self.has_updated = True
run_out = self.policy._execute_model(feed_dict, self.update_dict)
return run_out

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (5 changes)


param_keys = ["strength", "gamma"]
super().check_config(config_dict, param_keys)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
env_rews = mini_batch["environment_rewards"]
return RewardSignalResult(self.strength * env_rews, env_rews)
def evaluate(
self, current_info: BrainInfo, next_info: BrainInfo
) -> RewardSignalResult:

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py (6 changes)


"""
Creates the input layers for the discriminator
"""
self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)
self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
self.done_policy = tf.expand_dims(self.done_policy_holder, -1)
if self.policy_model.brain.vector_action_space_type == "continuous":
action_length = self.policy_model.act_size[0]

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py (62 changes)


feed_dict[
self.policy.model.action_holder
] = next_info.previous_vector_actions
if self.policy.use_recurrent:
if current_info.memories.shape[1] == 0:
current_info.memories = self.policy.make_empty_memory(
len(current_info.agents)
)
feed_dict[self.policy.model.memory_in] = current_info.memories
unscaled_reward = self.policy.sess.run(
self.model.intrinsic_reward, feed_dict=feed_dict
)

n_epoch = self.num_epoch
for _epoch in range(n_epoch):
self.demonstration_buffer.update_buffer.shuffle()
update_buffer.shuffle()
self.demonstration_buffer.update_buffer.shuffle(
sequence_length=self.policy.sequence_length
)
update_buffer.shuffle(sequence_length=self.policy.sequence_length)
if max_batches == 0:
num_batches = possible_batches
else:

:return: Output from update process.
"""
feed_dict: Dict[tf.Tensor, Any] = {
self.model.done_expert: mini_batch_demo["done"].reshape([-1, 1]),
self.model.done_policy: mini_batch_policy["done"].reshape([-1, 1]),
self.model.done_expert_holder: mini_batch_demo["done"],
self.model.done_policy_holder: mini_batch_policy["done"],
feed_dict[self.model.action_in_expert] = np.array(mini_batch_demo["actions"])
feed_dict[self.policy.model.selected_actions] = mini_batch_policy[
"actions"
].reshape([-1, self.policy.model.act_size[0]])
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, self.policy.model.act_size[0]]
)
feed_dict[self.policy.model.selected_actions] = mini_batch_policy["actions"]
feed_dict[self.policy.model.action_holder] = mini_batch_policy[
"actions"
].reshape([-1, len(self.policy.model.act_size)])
feed_dict[self.model.action_in_expert] = mini_batch_demo["actions"].reshape(
[-1, len(self.policy.model.act_size)]
)
feed_dict[self.policy.model.action_holder] = mini_batch_policy["actions"]
policy_obs = mini_batch_policy["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = policy_obs.shape
feed_dict[self.policy.model.visual_in[i]] = policy_obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.policy.model.visual_in[i]] = policy_obs
demo_obs = mini_batch_demo["visual_obs%d" % i]
if self.policy.sequence_length > 1 and self.policy.use_recurrent:
(_batch, _seq, _w, _h, _c) = demo_obs.shape
feed_dict[self.model.expert_visual_in[i]] = demo_obs.reshape(
[-1, _w, _h, _c]
)
else:
feed_dict[self.model.expert_visual_in[i]] = demo_obs
feed_dict[self.policy.model.visual_in[i]] = mini_batch_policy[
"visual_obs%d" % i
]
feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
"visual_obs%d" % i
]
feed_dict[self.policy.model.vector_in] = mini_batch_policy[
"vector_obs"
].reshape([-1, self.policy.vec_obs_size])
feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"].reshape(
[-1, self.policy.vec_obs_size]
)
feed_dict[self.policy.model.vector_in] = mini_batch_policy["vector_obs"]
feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
out_dict = {
"gail_loss": self.model.loss,

ml-agents/mlagents/trainers/learn.py (55 changes)


from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.exception import TrainerError
from mlagents.trainers import MetaCurriculumError, MetaCurriculum
from mlagents.trainers.trainer_util import initialize_trainers
from mlagents.envs import UnityEnvironment
from mlagents.envs.sampler_class import SamplerManager
from mlagents.envs.exception import UnityEnvironmentException, SamplerException

lesson = int(run_options["--lesson"])
fast_simulation = not bool(run_options["--slow"])
no_graphics = run_options["--no-graphics"]
multi_gpu = run_options["--multi-gpu"]
trainer_config_path = run_options["<trainer-config-path>"]
sampler_file_path = (
run_options["--sampler"] if run_options["--sampler"] != "None" else None

base_port + (sub_id * num_envs),
)
env = SubprocessEnvManager(env_factory, num_envs)
maybe_meta_curriculum = try_create_meta_curriculum(curriculum_folder, env)
maybe_meta_curriculum = try_create_meta_curriculum(curriculum_folder, env, lesson)
trainers = initialize_trainers(
trainer_config,
env.external_brains,
summaries_dir,
run_id,
model_path,
keep_checkpoints,
train_model,
load_model,
run_seed,
maybe_meta_curriculum,
multi_gpu,
)
trainers,
load_model,
keep_checkpoints,
lesson,
run_seed,
fast_simulation,
sampler_manager,

process_queue.put(True)
# Begin training
tc.start_learning(env, trainer_config)
tc.start_learning(env)
def create_sampler_manager(sampler_file_path, env_reset_params, run_seed=None):

sampler_config = load_config(sampler_file_path)
if ("resampling-interval") in sampler_config:
if "resampling-interval" in sampler_config:
# Filter arguments that do not exist in the environment
resample_interval = sampler_config.pop("resampling-interval")
if (resample_interval <= 0) or (not isinstance(resample_interval, int)):

def try_create_meta_curriculum(
curriculum_folder: Optional[str], env: SubprocessEnvManager
curriculum_folder: Optional[str], env: SubprocessEnvManager, lesson: int
if meta_curriculum:
for brain_name in meta_curriculum.brains_to_curriculums.keys():
if brain_name not in env.external_brains.keys():
raise MetaCurriculumError(
"One of the curricula "
"defined in " + curriculum_folder + " "
"does not have a corresponding "
"Brain. Check that the "
"curriculum file has the same "
"name as the Brain "
"whose curriculum it defines."
)
# TODO: Should be able to start learning at different lesson numbers
# for each curriculum.
meta_curriculum.set_all_curriculums_to_lesson_num(lesson)
for brain_name in meta_curriculum.brains_to_curriculums.keys():
if brain_name not in env.external_brains.keys():
raise MetaCurriculumError(
"One of the curricula "
"defined in " + curriculum_folder + " "
"does not have a corresponding "
"Brain. Check that the "
"curriculum file has the same "
"name as the Brain "
"whose curriculum it defines."
)
return meta_curriculum

--docker-target-name=<dt> Docker volume to store training-specific files [default: None].
--no-graphics Whether to run the environment in no-graphics mode [default: False].
--debug Whether to run ML-Agents in debug mode with detailed logging [default: False].
--multi-gpu Whether to use multiple GPU training [default: False].
"""
options = docopt(_USAGE)

ml-agents/mlagents/trainers/models.py (258 changes)


import logging
from enum import Enum
from typing import Any, Callable, Dict
from typing import Any, Callable, Dict, List
import numpy as np
import tensorflow as tf

ActivationFunction = Callable[[tf.Tensor], tf.Tensor]
EPSILON = 1e-7
class EncoderType(Enum):

:param all_logits: The concatenated unnormalized action probabilities for all branches
:param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action]
:param action_size: A list containing the number of possible actions for each branch
:return: The action output dimension [batch_size, num_branches] and the concatenated normalized logits
:return: The action output dimension [batch_size, num_branches], the concatenated
normalized probs (after softmax)
and the concatenated normalized log probs
"""
action_idx = [0] + list(np.cumsum(action_size))
branches_logits = [

for i in range(len(action_size))
]
raw_probs = [
tf.multiply(tf.nn.softmax(branches_logits[k]) + 1.0e-10, branch_masks[k])
tf.multiply(tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
for k in range(len(action_size))
]
normalized_probs = [

output = tf.concat(
[
tf.multinomial(tf.log(normalized_probs[k]), 1)
tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
for k in range(len(action_size))
],
axis=1,

tf.concat([normalized_probs[k] for k in range(len(action_size))], axis=1),
tf.log(normalized_probs[k] + 1.0e-10)
tf.log(normalized_probs[k] + EPSILON)
for k in range(len(action_size))
],
axis=1,
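
A numpy sketch of the masking math above (an illustrative re-implementation, not the TF graph code itself), using the module's EPSILON = 1e-7:

import numpy as np

EPSILON = 1e-7

def masked_branch_probs(branch_logits: np.ndarray, branch_mask: np.ndarray) -> np.ndarray:
    # softmax within the branch
    e = np.exp(branch_logits - branch_logits.max(axis=1, keepdims=True))
    probs = e / e.sum(axis=1, keepdims=True)
    # zero out masked actions, then renormalize the remainder
    raw = (probs + EPSILON) * branch_mask
    return raw / raw.sum(axis=1, keepdims=True)

logits = np.array([[2.0, 1.0, 0.0]])
mask = np.array([[1.0, 0.0, 1.0]])  # second action disallowed
p = masked_branch_probs(logits, mask)
# p[0, 1] is ~0; sampling then uses log(p + EPSILON), as in tf.multinomial above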

h_size: int,
num_layers: int,
vis_encode_type: EncoderType = EncoderType.SIMPLE,
stream_scopes: List[str] = None,
) -> tf.Tensor:
"""
Creates encoding stream for observations.

:param stream_scopes: List of strings (length == num_streams), which contains
the scopes for each of the streams. None if all under the same TF scope.
:return: List of encoded streams.
"""
brain = self.brain

for i in range(num_streams):
visual_encoders = []
hidden_state, hidden_visual = None, None
_scope_add = stream_scopes[i] if stream_scopes else ""
if self.vis_obs_size > 0:
if vis_encode_type == EncoderType.RESNET:
for j in range(brain.number_visual_observations):

activation_fn,
num_layers,
"main_graph_{}_encoder{}".format(i, j),
_scope_add + "main_graph_{}_encoder{}".format(i, j),
False,
)
visual_encoders.append(encoded_visual)

h_size,
activation_fn,
num_layers,
"main_graph_{}_encoder{}".format(i, j),
_scope_add + "main_graph_{}_encoder{}".format(i, j),
False,
)
visual_encoders.append(encoded_visual)

h_size,
activation_fn,
num_layers,
"main_graph_{}_encoder{}".format(i, j),
_scope_add + "main_graph_{}_encoder{}".format(i, j),
False,
)
visual_encoders.append(encoded_visual)

h_size,
activation_fn,
num_layers,
"main_graph_{}".format(i),
_scope_add + "main_graph_{}".format(i),
False,
)
if hidden_state is not None and hidden_visual is not None:

value = tf.layers.dense(hidden_input, 1, name="{}_value".format(name))
self.value_heads[name] = value
self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
def create_cc_actor_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
"""
hidden_streams = self.create_observation_streams(
2, h_size, num_layers, vis_encode_type
)
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
_half_point = int(self.m_size / 2)
hidden_policy, memory_policy_out = self.create_recurrent_encoder(
hidden_streams[0],
self.memory_in[:, :_half_point],
self.sequence_length,
name="lstm_policy",
)
hidden_value, memory_value_out = self.create_recurrent_encoder(
hidden_streams[1],
self.memory_in[:, _half_point:],
self.sequence_length,
name="lstm_value",
)
self.memory_out = tf.concat(
[memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
)
else:
hidden_policy = hidden_streams[0]
hidden_value = hidden_streams[1]
mu = tf.layers.dense(
hidden_policy,
self.act_size[0],
activation=None,
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01),
)
self.log_sigma_sq = tf.get_variable(
"log_sigma_squared",
[self.act_size[0]],
dtype=tf.float32,
initializer=tf.zeros_initializer(),
)
sigma_sq = tf.exp(self.log_sigma_sq)
self.epsilon = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
)
# Clip and scale output to ensure actions are always within [-1, 1] range.
self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(output_post)
# Compute probability of model output.
all_probs = (
-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
- 0.5 * tf.log(2.0 * np.pi)
- 0.5 * self.log_sigma_sq
)
self.all_log_probs = tf.identity(all_probs, name="action_probs")
self.entropy = 0.5 * tf.reduce_mean(
tf.log(2 * np.pi * np.e) + self.log_sigma_sq
)
self.create_value_heads(self.stream_names, hidden_value)
self.all_old_log_probs = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities"
)
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.log_probs = tf.reduce_sum(
(tf.identity(self.all_log_probs)), axis=1, keepdims=True
)
self.old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
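
For reference, the all_probs expression above is the log-density of a diagonal Gaussian evaluated at the sampled pre-clip action. A numpy check of the same formula (illustrative only):

import numpy as np

def gaussian_log_prob(output_pre, mu, log_sigma_sq):
    sigma_sq = np.exp(log_sigma_sq)
    return (
        -0.5 * np.square(output_pre - mu) / sigma_sq
        - 0.5 * np.log(2.0 * np.pi)
        - 0.5 * log_sigma_sq
    )

# standard normal (mu=0, sigma=1) evaluated at its mean:
lp = gaussian_log_prob(np.zeros(1), np.zeros(1), np.zeros(1))
# lp[0] == -0.5 * log(2 * pi) ~= -0.9189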
def create_dc_actor_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
"""
hidden_streams = self.create_observation_streams(
1, h_size, num_layers, vis_encode_type
)
hidden = hidden_streams[0]
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden = tf.concat([hidden, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden, memory_out = self.create_recurrent_encoder(
hidden, self.memory_in, self.sequence_length
)
self.memory_out = tf.identity(memory_out, name="recurrent_out")
policy_branches = []
for size in self.act_size:
policy_branches.append(
tf.layers.dense(
hidden,
size,
activation=None,
use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(
factor=0.01
),
)
)
self.all_log_probs = tf.concat(
[branch for branch in policy_branches], axis=1, name="action_probs"
)
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
output, normalized_logits = self.create_discrete_action_masking_layer(
self.all_log_probs, self.action_masks, self.act_size
)
self.output = tf.identity(output)
self.normalized_logits = tf.identity(normalized_logits, name="action")
self.create_value_heads(self.stream_names, hidden)
self.action_holder = tf.placeholder(
shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
)
self.action_oh = tf.concat(
[
tf.one_hot(self.action_holder[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
self.selected_actions = tf.stop_gradient(self.action_oh)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
)
_, old_normalized_logits = self.create_discrete_action_masking_layer(
self.all_old_log_probs, self.action_masks, self.act_size
)
action_idx = [0] + list(np.cumsum(self.act_size))
self.entropy = tf.reduce_sum(
(
tf.stack(
[
tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf.nn.softmax(
self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
),
logits=self.all_log_probs[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
)
self.log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
logits=normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
self.old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)

ml-agents/mlagents/trainers/ppo/models.py (241 changes)


max_step,
)
def create_cc_actor_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
"""
hidden_streams = self.create_observation_streams(
2, h_size, num_layers, vis_encode_type
)
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
_half_point = int(self.m_size / 2)
hidden_policy, memory_policy_out = self.create_recurrent_encoder(
hidden_streams[0],
self.memory_in[:, :_half_point],
self.sequence_length,
name="lstm_policy",
)
hidden_value, memory_value_out = self.create_recurrent_encoder(
hidden_streams[1],
self.memory_in[:, _half_point:],
self.sequence_length,
name="lstm_value",
)
self.memory_out = tf.concat(
[memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
)
else:
hidden_policy = hidden_streams[0]
hidden_value = hidden_streams[1]
mu = tf.layers.dense(
hidden_policy,
self.act_size[0],
activation=None,
kernel_initializer=LearningModel.scaled_init(0.01),
)
self.log_sigma_sq = tf.get_variable(
"log_sigma_squared",
[self.act_size[0]],
dtype=tf.float32,
initializer=tf.zeros_initializer(),
)
sigma_sq = tf.exp(self.log_sigma_sq)
self.epsilon = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
)
# Clip and scale output to ensure actions are always within [-1, 1] range.
self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(output_post)
# Compute probability of model output.
all_probs = (
-0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
- 0.5 * tf.log(2.0 * np.pi)
- 0.5 * self.log_sigma_sq
)
self.all_log_probs = tf.identity(all_probs, name="action_probs")
self.entropy = 0.5 * tf.reduce_mean(
tf.log(2 * np.pi * np.e) + self.log_sigma_sq
)
self.create_value_heads(self.stream_names, hidden_value)
self.all_old_log_probs = tf.placeholder(
shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities"
)
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.log_probs = tf.reduce_sum(
(tf.identity(self.all_log_probs)), axis=1, keepdims=True
)
self.old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
def create_dc_actor_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
"""
hidden_streams = self.create_observation_streams(
1, h_size, num_layers, vis_encode_type
)
hidden = hidden_streams[0]
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden = tf.concat([hidden, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden, memory_out = self.create_recurrent_encoder(
hidden, self.memory_in, self.sequence_length
)
self.memory_out = tf.identity(memory_out, name="recurrent_out")
policy_branches = []
for size in self.act_size:
policy_branches.append(
tf.layers.dense(
hidden,
size,
activation=None,
use_bias=False,
kernel_initializer=LearningModel.scaled_init(0.01),
)
)
self.all_log_probs = tf.concat(
[branch for branch in policy_branches], axis=1, name="action_probs"
)
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
output, _, normalized_logits = self.create_discrete_action_masking_layer(
self.all_log_probs, self.action_masks, self.act_size
)
self.output = tf.identity(output)
self.normalized_logits = tf.identity(normalized_logits, name="action")
self.create_value_heads(self.stream_names, hidden)
self.action_holder = tf.placeholder(
shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
)
self.action_oh = tf.concat(
[
tf.one_hot(self.action_holder[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
self.selected_actions = tf.stop_gradient(self.action_oh)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
)
_, _, old_normalized_logits = self.create_discrete_action_masking_layer(
self.all_old_log_probs, self.action_masks, self.act_size
)
action_idx = [0] + list(np.cumsum(self.act_size))
self.entropy = tf.reduce_sum(
(
tf.stack(
[
tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf.nn.softmax(
self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
),
logits=self.all_log_probs[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.act_size))
],
axis=1,
)
),
axis=1,
)
self.log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(