
State Stacking & Banana Environment (#262)

* Add support for stacking the past n states, allowing the network to learn temporal dependencies.
* Add the Banana Collector environment, which demonstrates partially observable multi-agent environments.
* Add 3DBall Hard, which lacks velocity information in its state representation. Used as a test for the LSTM and state-stacking features.
* Rework the Tennis environment to be continuous control and trainable in 100k steps.
/develop-generalizationTraining-TrainerController
GitHub 7 years ago
Current commit
51621334
91 files changed, with 12,262 insertions and 2,017 deletions
  1. 26
      docs/Example-Environments.md
  2. 77
      python/learn.py
  3. 10
      python/trainer_configurations.yaml
  4. 289
      python/trainers/buffer.py
  5. 39
      python/trainers/ghost_trainer.py
  6. 2
      python/trainers/imitation_trainer.py
  7. 124
      python/trainers/ppo_models.py
  8. 110
      python/trainers/ppo_trainer.py
  9. 13
      python/trainers/trainer.py
  10. 13
      python/unityagents/brain.py
  11. 6
      python/unityagents/environment.py
  12. 54
      unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs
  13. 2
      unity-environment/Assets/ML-Agents/Examples/Tennis/Materials/NetMat.mat
  14. 99
      unity-environment/Assets/ML-Agents/Examples/Tennis/Prefabs/TennisArea.prefab
  15. 39
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs
  16. 9
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/TennisArea.cs
  17. 71
      unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/hitWall.cs
  18. 642
      unity-environment/Assets/ML-Agents/Examples/Tennis/TFModels/Tennis.bytes
  19. 5
      unity-environment/Assets/ML-Agents/Examples/Tennis/TFModels/Tennis.bytes.meta
  20. 776
      unity-environment/Assets/ML-Agents/Examples/Tennis/Tennis.unity
  21. 13
      unity-environment/Assets/ML-Agents/Scripts/Agent.cs
  22. 11
      unity-environment/Assets/ML-Agents/Scripts/Brain.cs
  23. 4
      unity-environment/Assets/ML-Agents/Scripts/CoreBrainInternal.cs
  24. 5
      unity-environment/ProjectSettings/TagManager.asset
  25. 1001
      images/banana.png
  26. 1001
      unity-environment/Assets/ML-Agents/Examples/3DBall/3DHardScene.unity
  27. 9
      unity-environment/Assets/ML-Agents/Examples/3DBall/3DHardScene.unity.meta
  28. 1001
      unity-environment/Assets/ML-Agents/Examples/3DBall/3DScene.unity
  29. 9
      unity-environment/Assets/ML-Agents/Examples/3DBall/3DScene.unity.meta
  30. 341
      unity-environment/Assets/ML-Agents/Examples/3DBall/Prefabs/GameHard.prefab
  31. 10
      unity-environment/Assets/ML-Agents/Examples/3DBall/Prefabs/GameHard.prefab.meta
  32. 59
      unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs
  33. 13
      unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs.meta
  34. 802
      unity-environment/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHard.bytes
  35. 9
      unity-environment/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHard.bytes.meta
  36. 10
      unity-environment/Assets/ML-Agents/Examples/Banana.meta
  37. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/3D Models.meta
  38. 45
      unity-environment/Assets/ML-Agents/Examples/Banana/3D Models/banana.fbx
  39. 93
      unity-environment/Assets/ML-Agents/Examples/Banana/3D Models/banana.fbx.meta
  40. 1001
      unity-environment/Assets/ML-Agents/Examples/Banana/Banana.unity
  41. 8
      unity-environment/Assets/ML-Agents/Examples/Banana/Banana.unity.meta
  42. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials.meta
  43. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/agent.mat
  44. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/agent.mat.meta
  45. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/bad.mat
  46. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/bad.mat.meta
  47. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/black.mat
  48. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/black.mat.meta
  49. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/ground.mat
  50. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/ground.mat.meta
  51. 77
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/lazer.mat
  52. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/lazer.mat.meta
  53. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/red.mat
  54. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/red.mat.meta
  55. 865
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/swatch Master.psd
  56. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/swatch Master.psd.meta
  57. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/swatch.mat
  58. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/swatch.mat.meta
  59. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/wall.mat
  60. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/wall.mat.meta
  61. 76
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/white.mat
  62. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Materials/white.mat.meta
  63. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs.meta
  64. 1001
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/Agent 1.prefab
  65. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/Agent 1.prefab.meta
  66. 1001
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/AreaPB.prefab
  67. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/AreaPB.prefab.meta
  68. 126
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/BAD BANANA.prefab
  69. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/BAD BANANA.prefab.meta
  70. 126
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/BANANA.prefab
  71. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/BANANA.prefab.meta
  72. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts.meta
  73. 34
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaAcademy.cs
  74. 13
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaAcademy.cs.meta
  75. 184
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaAgent.cs
  76. 12
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaAgent.cs.meta
  77. 53
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaArea.cs
  78. 13
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaArea.cs.meta
  79. 27
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaLogic.cs
  80. 13
      unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaLogic.cs.meta
  81. 10
      unity-environment/Assets/ML-Agents/Examples/Banana/TFModels.meta
  82. 1001
      unity-environment/Assets/ML-Agents/Examples/Banana/TFModels/Banana-fight.bytes
  83. 9
      unity-environment/Assets/ML-Agents/Examples/Banana/TFModels/Banana-fight.bytes.meta
  84. 1001
      unity-environment/Assets/ML-Agents/Examples/3DBall/Scene.unity
  85. 8
      unity-environment/Assets/ML-Agents/Examples/3DBall/Scene.unity.meta

26
docs/Example-Environments.md


* -1.0 if the ball falls from the platform.
* Brains: One brain with the following state/action space.
* State space: (Continuous) 8 variables corresponding to rotation of platform, and position, rotation, and velocity of ball.
* State space (Hard Version): (Continuous) 5 variables corresponding to rotation of platform and position and rotation of ball.
* Action space: (Continuous) Size of 2, with one value corresponding to X-rotation, and the other to Z-rotation.
* Observations: 0
* Reset Parameters: None
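To make the state-stacking feature of this commit concrete, below is a minimal numpy sketch of how keeping the past n observations turns the 5-variable hard state into a 5*n input vector. This is not the ML-Agents implementation (the stacking happens on the Unity side via the brain's stackedStates setting); the StateStacker class and its methods are illustrative only.

from collections import deque

import numpy as np

class StateStacker:
    """Hypothetical helper: keeps the last n raw states as one flat vector."""

    def __init__(self, state_size, n):
        self.state_size = state_size
        self.n = n
        # Start from zeros so the stacked vector has a fixed width from step one.
        self.history = deque([np.zeros(state_size, dtype=np.float32)] * n, maxlen=n)

    def append(self, state):
        self.history.append(np.asarray(state, dtype=np.float32))
        return np.concatenate(list(self.history))  # shape: (state_size * n,)

# The 5-variable 3DBall Hard state stacked 3 deep gives a 15-value input, which
# lets a feed-forward network infer velocity from consecutive positions.
stacker = StateStacker(state_size=5, n=3)
stacked = stacker.append(np.random.randn(5))
assert stacked.shape == (15,)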

* Goal: The agents must bounce the ball between one another while not dropping it or sending it out of bounds.
* Agents: The environment contains two agents linked to a single brain.
* Agent Reward Function (independent):
* -0.1 to the last agent to hit the ball before it goes out of bounds or hits the ground/net (episode ends).
* +0.1 to the agent when hitting the ball after it was hit by the other agent.
* +0.1 to the agent who didn't hit the ball last when the ball hits the ground.
* +0.1 to the agent when hitting the ball over the net.
* -0.1 to the agent who lets the ball hit their ground, or hits the ball out of bounds.
* Action space: (Discrete) Size of 4, corresponding to movement toward net, away from net, jumping, and no-movement.
* Action space: (Continuous) Size of 2, corresponding to movement toward net or away from net, and jumping.
* Observations: None
* Reset Parameters: One, corresponding to size of ball.

* State space: (Continuous) 117 variables corresponding to position, rotation, velocity, and angular velocities of each limb plus the acceleration and angular acceleration of the body.
* Action space: (Continuous) Size of 12, corresponding to torque applicable to 12 joints.
* Observations: None
* Reset Parameters: None
## Banana Collector
![Banana](../images/banana.png)
* Set-up: A multi-agent environment where agents compete to collect bananas.
* Goal: The agents must learn to move to as many yellow bananas as possible while avoiding red bananas.
* Agents: The environment contains 10 agents linked to a single brain.
* Agent Reward Function (independent):
* +1 for interaction with a yellow banana.
* -1 for interaction with a red banana.
* Brains: One brain with the following state/action space.
* State space: (Continuous) 51 variables corresponding to the velocity of the agent, plus ray-based perception of objects around the agent's forward direction.
* Action space: (Continuous) Size of 3, corresponding to forward movement, y-axis rotation, and whether to use the laser to disable other agents.
* Observations (Optional): First-person view for each agent.
* Reset Parameters: None
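A short usage sketch for connecting to a built Banana environment with the unityagents API that appears elsewhere in this diff. The executable name "Banana" and the random-action policy are assumptions; the 51-value state width comes from the description above.

import numpy as np
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Banana")  # assumed name of a local build
brain_name = env.external_brain_names[0]

info = env.reset(train_mode=True)[brain_name]
print(info.states.shape)  # one row of 51 values per agent, per the description above

for _ in range(10):
    # One 3-value continuous action per agent: forward motion, y rotation, laser.
    actions = np.random.uniform(-1, 1, size=(len(info.agents), 3))
    info = env.step(action={brain_name: actions})[brain_name]

env.close()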

77
python/learn.py


from trainers.imitation_trainer import ImitationTrainer
from unityagents import UnityEnvironment, UnityEnvironmentException
def get_progress():
if curriculum_file is not None:
if env.curriculum.measure_type == "progress":

elif env.curriculum.measure_type == "reward":
progress = 0
for brain_name in env.external_brain_names:
progress += trainers[brain_name].get_last_reward
return progress
else:
return None

if __name__ == '__main__' :
if __name__ == '__main__':
logger = logging.getLogger("unityagents")
_USAGE = '''
Usage:

os.makedirs(model_path)
except:
raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
" Please make sure the permissions are set correctly.".format(model_path))
" Please make sure the permissions are set correctly.".format(model_path))
try:
with open("trainer_configurations.yaml") as data_file:

.format("trainer_configurations.yaml"))
.format("trainer_configurations.yaml"))
except UnicodeDecodeError:
raise UnityEnvironmentException("There was an error decoding {}".format("trainer_configurations.yaml"))

if len(env.external_brain_names) > 1:
graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
trainer_parameters['graph_scope'] = graph_scope
trainer_parameters['summary_path'] = './summaries/{}'.format(str(options['--run-path']))+'_'+graph_scope
else :
trainer_parameters['summary_path'] = './summaries/{}'.format(
str(options['--run-path'])) + '_' + graph_scope
else:
trainer_parameters['graph_scope'] = ''
trainer_parameters['summary_path'] = './summaries/{}'.format(str(options['--run-path']))
if brain_name in trainer_configurations:

if trainer_parameters_dict[brain_name]['is_ghost']:
if trainer_parameters_dict[brain_name]['brain_to_copy'] not in env.external_brain_names:
raise UnityEnvironmentException("The external brain {0} could not be found in the environment "
"even though the ghost trainer of brain {1} is trying to ghost it."
.format(trainer_parameters_dict[brain_name]['brain_to_copy'], brain_name))
"even though the ghost trainer of brain {1} is trying to ghost it."
.format(trainer_parameters_dict[brain_name]['brain_to_copy'],
brain_name))
trainers[brain_name] = GhostTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name], train_model)
trainers[brain_name] = GhostTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name],
train_model)
trainers[brain_name] = ImitationTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name], train_model)
trainers[brain_name] = ImitationTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name],
train_model)
trainers[brain_name] = PPOTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name], train_model)
trainers[brain_name] = PPOTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name],
train_model)
for k, t in trainers.items():
logger.info(t)

if load_model:
logger.info('Loading Model...')
ckpt = tf.train.get_checkpoint_state(model_path)
if ckpt == None:
logger.info('The model {0} could not be found. Make sure you specified the right '
'--run-path'.format(model_path))
if ckpt is None:
logger.info('The model {0} could not be found. Make sure you specified the right '
'--run-path'.format(model_path))
global_step = 0 # This is only for saving the model
global_step = 0 # This is only for saving the model
info = env.reset(train_mode= fast_simulation)
info = env.reset(train_mode=fast_simulation)
trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
try:
while any([t.get_step <= t.get_max_steps for k, t in trainers.items()]) or not train_model:
if env.global_done:

trainer.end_episode()
# Decide and take an action
take_action_actions = {}
take_action_memories = {}
take_action_values = {}
take_action_outputs = {}
take_action_actions, take_action_memories, take_action_values, take_action_outputs = {}, {}, {}, {}
take_action_memories[brain_name],
take_action_values[brain_name],
take_action_outputs[brain_name]) = trainer.take_action(info)
new_info = env.step(action = take_action_actions, memory = take_action_memories, value = take_action_values)
take_action_memories[brain_name],
take_action_values[brain_name],
take_action_outputs[brain_name]) = trainer.take_action(info)
new_info = env.step(action=take_action_actions, memory=take_action_memories, value=take_action_values)
for brain_name, trainer in trainers.items():
trainer.add_experiences(info, new_info, take_action_outputs[brain_name])
info = new_info

if global_step != 0 and train_model:
save_model(sess, model_path=model_path, steps=global_step, saver=saver)
except KeyboardInterrupt:
if train_model:
logger.info("Learning was interupted. Please wait while the graph is generated.")
save_model(sess, model_path=model_path, steps=global_step, saver=saver)
pass
if train_model:
logger.info("Learning was interrupted. Please wait while the graph is generated.")
save_model(sess, model_path=model_path, steps=global_step, saver=saver)
pass
.replace('.app', '').replace('.exe', '').replace('.x86_64', '').replace('.x86', ''))
graph_name = os.path.basename(os.path.normpath(graph_name))
nodes = []
scopes = []

if scope == '/':
scope = ''
nodes +=[scope + x for x in ["action"]]
nodes += [scope + x for x in ["action"]]
nodes +=[scope + x for x in ["action","value_estimate","action_probs"]]
nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
nodes +=[scope + x for x in ["action","value_estimate","action_probs","recurrent_out"]]
nodes += [scope + x for x in ["action", "value_estimate", "action_probs", "recurrent_out"]]
logger.info("\t" + scope )
logger.info("\t" + scope)
logger.info("\t" + n)
logger.info("\t" + n)

10
python/trainer_configurations.yaml


default:
batch_size: 64
batch_size: 256
buffer_size: 2048
buffer_size: 5000
hidden_units: 64
hidden_units: 128
lambd: 0.95
learning_rate: 3.0e-4
max_steps: 1.0e6

time_horizon: 2048
time_horizon: 64
summary_freq: 10000
summary_freq: 1000
use_recurrent: false
Ball3DBrain:
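The YAML hunk above shows a shared default block plus a per-brain Ball3DBrain section. The sketch below illustrates how such a file can be merged into per-brain trainer parameters; the merge helper is illustrative and not the exact logic used by learn.py.

import yaml

with open("trainer_configurations.yaml") as data_file:
    trainer_configurations = yaml.safe_load(data_file)

def parameters_for(brain_name):
    # Start from the shared defaults, then apply any brain-specific overrides.
    params = dict(trainer_configurations["default"])
    params.update(trainer_configurations.get(brain_name, {}))
    return params

# Ball3DBrain inherits e.g. learning_rate from `default` while overriding the
# keys listed under its own section.
print(parameters_for("Ball3DBrain"))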

289
python/trainers/buffer.py


from unityagents.exception import UnityException
class BufferException(UnityException):
"""
Related to errors with the Buffer.

"""
"""
class AgentBuffer(dict):
"""
class AgentBuffer(dict):
"""
class AgentBufferField(list):
"""
class AgentBufferField(list):
"""
def __str__(self):
return str(np.array(self).shape)
def __str__(self):
return str(np.array(self).shape)
def extend(self, data):
"""
def extend(self, data):
"""
self += list(np.array(data))
self += list(np.array(data))
def set(self, data):
"""
def set(self, data):
"""
self[:] = []
self[:] = list(np.array(data))
self[:] = []
self[:] = list(np.array(data))
def get_batch(self, batch_size = None, training_length = None, sequential = True):
"""
def get_batch(self, batch_size=None, training_length=None, sequential=True):
"""
Retrieve the last batch_size elements of length training_length
from the list of np.array
:param batch_size: The number of elements to retrieve. If None:

sequential=True gives [[0,a],[b,c],[d,e]]; if sequential=False, it gives
[[a,b],[b,c],[c,d],[d,e]].
"""
if training_length is None:
# When the training length is None, the method returns a list of elements,
# not a list of sequences of elements.
if batch_size is None:
# If batch_size is None : All the elements of the AgentBufferField are returned.
return np.array(self)
else:
# return the batch_size last elements
if batch_size > len(self):
raise BufferException("Batch size requested is too large")
return np.array(self[-batch_size:])
else:
# The training_length is not None, the method returns a list of SEQUENCES of elements
if not sequential:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size :
raise BufferException("The batch size and training length requested for get_batch where"
" too large given the current number of data points.")
return
tmp_list = []
for end in range(len(self)-batch_size+1, len(self)+1):
tmp_list += [np.array(self[end-training_length:end])]
return np.array(tmp_list)
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length +1 *(leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (len(self) // training_length +1 *(leftover != 0)):
raise BufferException("The batch size and training length requested for get_batch where"
" too large given the current number of data points.")
return
tmp_list = []
padding = np.array(self[-1]) * 0
# The padding is made with zeros and its shape is given by the shape of the last element
for end in range(len(self), len(self) % training_length , -training_length)[:batch_size]:
tmp_list += [np.array(self[end-training_length:end])]
if (leftover != 0) and (len(tmp_list) < batch_size):
tmp_list +=[np.array([padding]*(training_length - leftover)+self[:leftover])]
tmp_list.reverse()
return np.array(tmp_list)
if training_length is None:
# When the training length is None, the method returns a list of elements,
# not a list of sequences of elements.
if batch_size is None:
# If batch_size is None : All the elements of the AgentBufferField are returned.
return np.array(self)
else:
# return the batch_size last elements
if batch_size > len(self):
raise BufferException("Batch size requested is too large")
return np.array(self[-batch_size:])
else:
# The training_length is not None, the method returns a list of SEQUENCES of elements
if not sequential:
# The sequences will have overlapping elements
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) - training_length + 1
# The number of sequences of length training_length taken from a list of len(self) elements
# with overlapping is equal to batch_size
if (len(self) - training_length + 1) < batch_size:
raise BufferException("The batch size and training length requested for get_batch where"
" too large given the current number of data points.")
return
tmp_list = []
for end in range(len(self) - batch_size + 1, len(self) + 1):
tmp_list += [np.array(self[end - training_length:end])]
return np.array(tmp_list)
if sequential:
# The sequences will not have overlapping elements (this involves padding)
leftover = len(self) % training_length
# leftover is the number of elements in the first sequence (this sequence might need 0 padding)
if batch_size is None:
# retrieve the maximum number of elements
batch_size = len(self) // training_length + 1 * (leftover != 0)
# The maximum number of sequences taken from a list of length len(self) without overlapping
# with padding is equal to batch_size
if batch_size > (len(self) // training_length + 1 * (leftover != 0)):
raise BufferException("The batch size and training length requested for get_batch where"
" too large given the current number of data points.")
return
tmp_list = []
padding = np.array(self[-1]) * 0
# The padding is made with zeros and its shape is given by the shape of the last element
for end in range(len(self), len(self) % training_length, -training_length)[:batch_size]:
tmp_list += [np.array(self[end - training_length:end])]
if (leftover != 0) and (len(tmp_list) < batch_size):
tmp_list += [np.array([padding] * (training_length - leftover) + self[:leftover])]
tmp_list.reverse()
return np.array(tmp_list)
def reset_field(self):
"""
def reset_field(self):
"""
self[:] = []
self[:] = []
def __str__(self):
return ", ".join(["'{0}' : {1}".format(k, str(self[k])) for k in self.keys()])
def __str__(self):
return ", ".join(["'{0}' : {1}".format(k, str(self[k])) for k in self.keys()])
def reset_agent(self):
"""
def reset_agent(self):
"""
for k in self.keys():
self[k].reset_field()
for k in self.keys():
self[k].reset_field()
def __getitem__(self, key):
if key not in self.keys():
self[key] = self.AgentBufferField()
return super(Buffer.AgentBuffer, self).__getitem__(key)
def __getitem__(self, key):
if key not in self.keys():
self[key] = self.AgentBufferField()
return super(Buffer.AgentBuffer, self).__getitem__(key)
def check_length(self, key_list):
"""
def check_length(self, key_list):
"""
if len(key_list) < 2:
return True
l = None
for key in key_list:
if key not in self.keys():
return False
if ((l != None) and (l!=len(self[key]))):
return False
l = len(self[key])
return True
if len(key_list) < 2:
return True
l = None
for key in key_list:
if key not in self.keys():
return False
if ((l != None) and (l != len(self[key]))):
return False
l = len(self[key])
return True
def shuffle(self, key_list = None):
"""
def shuffle(self, key_list=None):
"""
if key_list is None:
key_list = list(self.keys())
if not self.check_length(key_list):
raise BufferException("Unable to shuffle if the fields are not of same length")
return
s = np.arange(len(self[key_list[0]]))
np.random.shuffle(s)
for key in key_list:
self[key][:] = [self[key][i] for i in s]
if key_list is None:
key_list = list(self.keys())
if not self.check_length(key_list):
raise BufferException("Unable to shuffle if the fields are not of same length")
return
s = np.arange(len(self[key_list[0]]))
np.random.shuffle(s)
for key in key_list:
self[key][:] = [self[key][i] for i in s]
def __init__(self):
self.update_buffer = self.AgentBuffer()
super(Buffer, self).__init__()
def __init__(self):
self.update_buffer = self.AgentBuffer()
super(Buffer, self).__init__()
def __str__(self):
return "update buffer :\n\t{0}\nlocal_buffers :\n{1}".format(str(self.update_buffer),
'\n'.join(['\tagent {0} :{1}'.format(k, str(self[k])) for k in self.keys()]))
def __str__(self):
return "update buffer :\n\t{0}\nlocal_buffers :\n{1}".format(str(self.update_buffer),
'\n'.join(
['\tagent {0} :{1}'.format(k, str(self[k])) for
k in self.keys()]))
def __getitem__(self, key):
if key not in self.keys():
self[key] = self.AgentBuffer()
return super(Buffer, self).__getitem__(key)
def __getitem__(self, key):
if key not in self.keys():
self[key] = self.AgentBuffer()
return super(Buffer, self).__getitem__(key)
def reset_update_buffer(self):
"""
def reset_update_buffer(self):
"""
self.update_buffer.reset_agent()
self.update_buffer.reset_agent()
def reset_all(self):
"""
def reset_all(self):
"""
self.update_buffer.reset_agent()
agent_ids = list(self.keys())
for k in agent_ids:
self[k].reset_agent()
# self.update_buffer.reset_agent()
agent_ids = list(self.keys())
for k in agent_ids:
self[k].reset_agent()
def append_update_buffer(self, agent_id ,key_list = None, batch_size = None, training_length = None):
"""
def append_update_buffer(self, agent_id, key_list=None, batch_size=None, training_length=None):
"""
Appends the buffer of an agent to the update buffer.
:param agent_id: The id of the agent which data will be appended
:param key_list: The fields that must be added. If None: all fields will be appended.

if key_list is None:
key_list = self[agent_id].keys()
if not self[agent_id].check_length(key_list):
raise BufferException("The length of the fields {0} for agent {1} where not of same length"
.format(key_list, agent_id))
for field_key in key_list:
self.update_buffer[field_key].extend(
self[agent_id][field_key].get_batch(batch_size =batch_size, training_length =training_length)
)
if key_list is None:
key_list = self[agent_id].keys()
if not self[agent_id].check_length(key_list):
raise BufferException("The length of the fields {0} for agent {1} where not of same length"
.format(key_list, agent_id))
for field_key in key_list:
self.update_buffer[field_key].extend(
self[agent_id][field_key].get_batch(batch_size=batch_size, training_length=training_length)
)
def append_all_agent_batch_to_update_buffer(self, key_list = None, batch_size = None, training_length = None):
"""
def append_all_agent_batch_to_update_buffer(self, key_list=None, batch_size=None, training_length=None):
"""
for agent_id in self.keys():
self.append_update_buffer(agent_id ,key_list, batch_size, training_length)
for agent_id in self.keys():
self.append_update_buffer(agent_id, key_list, batch_size, training_length)
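A small usage sketch of AgentBufferField.get_batch that reproduces the example in its docstring, assuming it is run from the python/ directory so that trainers.buffer is importable. The field name 'dummy' and the integer data are arbitrary; note the left padding value is derived from the last element and is therefore zero here.

from trainers.buffer import Buffer

buffer = Buffer()
agent_id = 0
for x in [1, 2, 3, 4, 5]:
    buffer[agent_id]['dummy'].append(x)

field = buffer[agent_id]['dummy']

# Non-overlapping sequences of length 2, zero-padded on the left: [[0,1],[2,3],[4,5]]
print(field.get_batch(training_length=2, sequential=True))

# Overlapping sequences of length 2: [[1,2],[2,3],[3,4],[4,5]]
print(field.get_batch(training_length=2, sequential=False))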

39
python/trainers/ghost_trainer.py


logger = logging.getLogger("unityagents")
#This works only with PPO
# This works only with PPO
def __init__(self, sess, env, brain_name, trainer_parameters, training):
"""
Responsible for saving and reusing past models.

for k in self.param_keys:
if k not in trainer_parameters:
raise UnityTrainerException("The hyperparameter {0} could not be found for the PPO trainer of "
"brain {1}.".format(k, brain_name))
"brain {1}.".format(k, brain_name))
super(GhostTrainer, self).__init__(sess, env, brain_name, trainer_parameters, training)

self.max_num_models = trainer_parameters['max_num_models']
self.last_model_replaced = 0
for i in range(self.max_num_models):
with tf.variable_scope(self.variable_scope+'_'+str(i)):
self.models += [create_agent_model(env.brains[self.brain_to_copy],
lr=float(self.original_brain_parameters['learning_rate']),
h_size=int(self.original_brain_parameters['hidden_units']),
epsilon=float(self.original_brain_parameters['epsilon']),
beta=float(self.original_brain_parameters['beta']),
max_step=float(self.original_brain_parameters['max_steps']),
normalize=self.original_brain_parameters['normalize'],
use_recurrent=self.original_brain_parameters['use_recurrent'],
num_layers=int(self.original_brain_parameters['num_layers']),
m_size = self.original_brain_parameters)]
with tf.variable_scope(self.variable_scope + '_' + str(i)):
self.models += [create_agent_model(env.brains[self.brain_to_copy],
lr=float(self.original_brain_parameters['learning_rate']),
h_size=int(self.original_brain_parameters['hidden_units']),
epsilon=float(self.original_brain_parameters['epsilon']),
beta=float(self.original_brain_parameters['beta']),
max_step=float(self.original_brain_parameters['max_steps']),
normalize=self.original_brain_parameters['normalize'],
use_recurrent=self.original_brain_parameters['use_recurrent'],
num_layers=int(self.original_brain_parameters['num_layers']),
m_size=self.original_brain_parameters)]
self.is_continuous = (env.brains[brain_name].action_space_type == "continuous")
self.use_observations = (env.brains[brain_name].number_observations > 0)

@property
def parameters(self):
"""

from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
op_holder = []
for from_var,to_var in zip(from_vars,to_vars):
for from_var, to_var in zip(from_vars, to_vars):
op_holder.append(to_var.assign(from_var))
return op_holder

to be passed to add experiences
"""
epsi = None
info = info[self.brain_name]
feed_dict = {self.model.batch_size: len(info.states), self.model.sequence_length: 1}

self.last_model_replaced = (self.last_model_replaced + 1) % self.max_num_models
self.sess.run(self.update_target_graph(
self.original_brain_parameters['graph_scope'],
self.variable_scope+'_'+str(self.last_model_replaced))
self.variable_scope + '_' + str(self.last_model_replaced))
)
return

:param lesson_number: The lesson the trainer is at.
"""
return

2
python/trainers/imitation_trainer.py


os.makedirs(self.summary_path)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
s_size = self.brain.state_space_size * 1#brain_parameters.stacked_states
s_size = self.brain.state_space_size * self.brain.stacked_states
a_size = self.brain.action_space_size
with tf.variable_scope(self.variable_scope):
self.network = ImitationNN(state_size = s_size,

124
python/trainers/ppo_models.py


logger = logging.getLogger("unityagents")
def create_agent_model(brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, use_recurrent = False, num_layers=2, m_size = None):
def create_agent_model(brain, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6,
normalize=False, use_recurrent=False, num_layers=2, m_size=None):
:param env: a Unity environment.
:param brain: BrainInfo used to generate specific network graph.
:param lr: Learning rate.
:param h_size: Size of hidden layers.
:param epsilon: Value for policy-divergence threshold.

:param normalize: Whether to normalize vector observation input.
:param use_recurrent: Whether to use an LSTM layer in the network.
:param num_layers: Number of hidden layers between encoded input and policy & value layers.
if num_layers < 1: num_layers = 1
if num_layers < 1:
num_layers = 1
return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, use_recurrent, num_layers, m_size)
return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, use_recurrent, num_layers,
m_size)
return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, use_recurrent, num_layers, m_size)
return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, use_recurrent, num_layers,
m_size)
def save_model(sess, saver, model_path="./", steps=0):

class PPOModel(object):
def __init__(self):
def __init__(self, m_size, normalize, use_recurrent):
self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
self.sequence_length = tf.placeholder(shape=None, dtype=tf.int32, name='sequence_length')
self.m_size = m_size
self.global_step, self.increment_step = self.create_global_steps()
self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
self.normalize = normalize
self.use_recurrent = use_recurrent
self.state_in = None
def create_global_steps(self):
@staticmethod
def create_global_steps():
self.global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
self.increment_step = tf.assign(self.global_step, self.global_step + 1)
global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
increment_step = tf.assign(global_step, tf.add(global_step, 1))
return global_step, increment_step
def create_reward_encoder(self):
@staticmethod
def create_reward_encoder():
self.last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
self.update_reward = tf.assign(self.last_reward, self.new_reward)
last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
update_reward = tf.assign(last_reward, new_reward)
return last_reward, new_reward, update_reward
:param s_size: Dimension of the input tensor.
:param memory_in: The input memory to the LSTM cell.
:param name: The scope of the LSTM cell.
"""
s_size = input_state.get_shape().as_list()[1]
m_size = memory_in.get_shape().as_list()[1]

:param h_size: Hidden layer size.
:param num_streams: Number of visual streams to construct.
:param activation: What type of activation function to use for layers.
:param num_layers: number of hidden layers to create.
:return: List of hidden layer tensors.
"""
if bw:

self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
name='observation_%d' % len(self.observation_in)))
name='observation_%d' % len(self.observation_in)))
self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
use_bias=False, activation=activation)
self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
use_bias=False, activation=activation)
hidden = c_layers.flatten(self.conv2)
conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
activation=activation)
conv2 = tf.layers.conv2d(conv1, 32, kernel_size=[4, 4], strides=[2, 2],
activation=activation)
hidden = c_layers.flatten(conv2)
for j in range(num_layers):
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
streams.append(hidden)
for j in range(num_layers):
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
streams.append(hidden)
return streams
def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):

:param h_size: Hidden layer size.
:param num_streams: Number of state streams to construct.
:param activation: What type of activation function to use for layers.
:param num_layers: number of hidden layers to create.
:return: List of hidden layer tensors.
"""
self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state')

for i in range(num_streams):
hidden = self.normalized_state
for j in range(num_layers):
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
hidden = tf.layers.dense(hidden, h_size, activation=activation,
kernel_initializer=c_layers.variance_scaling_initializer(1.0))
streams.append(hidden)
return streams

:param h_size: Hidden layer size.
:param num_streams: Number of state streams to construct.
:param activation: What type of activation function to use for layers.
:param num_layers: number of hidden layers to create.
:return: List of hidden layer tensors.
"""
self.state_in = tf.placeholder(shape=[None, 1], dtype=tf.int32, name='state')

decay_beta = tf.train.polynomial_decay(beta, self.global_step,
max_step, 1e-5,
power=1.0)
self.loss = self.policy_loss + 0.5 * self.value_loss - decay_beta * tf.reduce_mean(entropy)
self.learning_rate = tf.train.polynomial_decay(lr, self.global_step,

class ContinuousControlModel(PPOModel):
def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, use_recurrent, num_layers,m_size):
def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, use_recurrent, num_layers, m_size):
self.m_size = m_size
super(ContinuousControlModel, self).__init__()
s_size = brain.state_space_size
super(ContinuousControlModel, self).__init__(m_size, normalize, use_recurrent)
self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
self.sequence_length = tf.placeholder(shape=None, dtype=tf.int32, name='sequence_length')
self.normalize = normalize
self.use_recurrent = use_recurrent
self.create_global_steps()
self.create_reward_encoder()
hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
if brain.number_observations > 0:
visual_encoder_0 = []

bw = brain.camera_resolutions[i]['blackAndWhite']
encoded_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
encoded_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh,
num_layers)
s_size = brain.state_space_size
s_size = brain.state_space_size * brain.stacked_states
if brain.state_space_type == "continuous":
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
else:

self.memory_out = tf.concat([memory_policy_out, memory_value_out], axis=1, name = 'recurrent_out')
self.mu = tf.layers.dense(hidden_policy, a_size, activation=None, use_bias=False,
self.log_sigma_sq = tf.get_variable("log_sigma_squared", [a_size], dtype=tf.float32,
initializer=tf.zeros_initializer())
self.sigma_sq = tf.exp(self.log_sigma_sq)

self.entropy = tf.reduce_sum(0.5 * tf.log(2 * np.pi * np.e * self.sigma_sq))
self.value = tf.layers.dense(hidden_value, 1, activation=None, use_bias=False)
self.value = tf.layers.dense(hidden_value, 1, activation=None)
self.value = tf.identity(self.value, name="value_estimate")
self.old_probs = tf.placeholder(shape=[None, a_size], dtype=tf.float32, name='old_probabilities')

class DiscreteControlModel(PPOModel):
def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize,use_recurrent, num_layers,m_size):
def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, use_recurrent, num_layers, m_size):
self.m_size = m_size
super(DiscreteControlModel, self).__init__()
self.create_global_steps()
self.create_reward_encoder()
self.normalize = normalize
self.use_recurrent = use_recurrent
self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
self.sequence_length = tf.placeholder(shape=None, dtype=tf.int32, name='sequence_length')
super(DiscreteControlModel, self).__init__(m_size, normalize, use_recurrent)
a_size = brain.action_space_size
hidden_state, hidden_visual, hidden = None, None, None
if brain.number_observations > 0:

bw = brain.camera_resolutions[i]['blackAndWhite']
visual_encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)[0])
visual_encoders.append(
self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)[0])
s_size = brain.state_space_size
s_size = brain.state_space_size * brain.stacked_states
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
hidden_state = \
self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
else:
hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]

a_size = brain.action_space_size
self.policy = tf.layers.dense(hidden, a_size, activation=None, use_bias=False,
self.value = tf.layers.dense(hidden, 1, activation=None, use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(factor=1.0))
self.value = tf.layers.dense(hidden, 1, activation=None)
self.value = tf.identity(self.value, name="value_estimate")
self.entropy = -tf.reduce_sum(self.probs * tf.log(self.probs + 1e-10), axis=1)
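As a side note, here is a standalone numpy mirror of the Gaussian policy head used by ContinuousControlModel above (mu, log_sigma_squared, sampled action, entropy). It is a hand-written illustration of the same formulas, not code from ppo_models.py, and the function name is made up.

import numpy as np

def sample_continuous_action(mu, log_sigma_sq, rng=np.random):
    sigma_sq = np.exp(log_sigma_sq)
    epsilon = rng.normal(size=mu.shape)        # the 'epsilons' recorded with each action
    action = mu + np.sqrt(sigma_sq) * epsilon  # sample from N(mu, sigma^2)
    # Differential entropy of a diagonal Gaussian: sum_i 0.5 * log(2*pi*e*sigma_i^2)
    entropy = np.sum(0.5 * np.log(2 * np.pi * np.e * sigma_sq))
    return action, entropy

a_size = 2                       # e.g. the 3DBall action space
mu = np.zeros(a_size)
log_sigma_sq = np.zeros(a_size)  # sigma^2 = 1 at initialization
action, entropy = sample_continuous_action(mu, log_sigma_sq)
print(action, entropy)           # entropy = a_size * 0.5 * log(2*pi*e) ≈ 2.84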

110
python/trainers/ppo_trainer.py


logger = logging.getLogger("unityagents")
def __init__(self, sess, env, brain_name, trainer_parameters, training):
"""
Responsible for collecting experiences and training PPO model.

:param training: Whether the trainer is set for training.
"""
self.param_keys = ['batch_size', 'beta','buffer_size','epsilon','gamma','hidden_units','lambd','learning_rate',
'max_steps','normalize','num_epoch','num_layers','time_horizon','sequence_length','summary_freq',
'use_recurrent','graph_scope','summary_path']
self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
'learning_rate',
'max_steps', 'normalize', 'num_epoch', 'num_layers', 'time_horizon', 'sequence_length',
'summary_freq',
'use_recurrent', 'graph_scope', 'summary_path']
"brain {1}.".format(k, brain_name))
"brain {1}.".format(k, brain_name))
super(PPOTrainer, self).__init__(sess, env, brain_name, trainer_parameters, training)

self.sequence_length = trainer_parameters["sequence_length"]
self.variable_scope = trainer_parameters['graph_scope']
with tf.variable_scope(self.variable_scope):
self.model = create_agent_model(env.brains[brain_name],
lr=float(trainer_parameters['learning_rate']),
h_size=int(trainer_parameters['hidden_units']),
epsilon=float(trainer_parameters['epsilon']),
beta=float(trainer_parameters['beta']),
max_step=float(trainer_parameters['max_steps']),
normalize=trainer_parameters['normalize'],
use_recurrent=trainer_parameters['use_recurrent'],
num_layers=int(trainer_parameters['num_layers']),
m_size = self.m_size)
self.model = create_agent_model(env.brains[brain_name],
lr=float(trainer_parameters['learning_rate']),
h_size=int(trainer_parameters['hidden_units']),
epsilon=float(trainer_parameters['epsilon']),
beta=float(trainer_parameters['beta']),
max_step=float(trainer_parameters['max_steps']),
normalize=trainer_parameters['normalize'],
use_recurrent=trainer_parameters['use_recurrent'],
num_layers=int(trainer_parameters['num_layers']),
m_size=self.m_size)
stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}

if self.use_recurrent:
feed_dict[self.model.memory_in] = info.memories
run_list += [self.model.memory_out]
if (self.is_training and self.brain.state_space_type == "continuous" and
if (self.is_training and self.brain.state_space_type == "continuous" and
self.use_states and self.trainer_parameters['normalize']):
new_mean, new_variance = self.running_average(info.states, steps, self.model.running_mean,
self.model.running_variance)

#only ask for memories if use_recurrent
# only ask for memories if use_recurrent
if self.use_recurrent:
actions, a_dist, value, ent, learn_rate, memories, _, _ = self.sess.run(run_list, feed_dict=feed_dict)
else:

if self.use_recurrent:
actions, a_dist, value, ent, learn_rate, memories = self.sess.run(run_list, feed_dict=feed_dict)
else:
actions, a_dist, value, ent, learn_rate= self.sess.run(run_list, feed_dict=feed_dict)
actions, a_dist, value, ent, learn_rate = self.sess.run(run_list, feed_dict=feed_dict)
return (actions, memories, value, (actions, epsi, a_dist, value))
return actions, memories, value, (actions, epsi, a_dist, value)
def add_experiences(self, info, next_info, take_action_outputs):
"""

if not info.local_done[idx]:
if self.use_observations:
for i, _ in enumerate(info.observations):
self.training_buffer[agent_id]['observations%d'%i].append(info.observations[i][idx])
self.training_buffer[agent_id]['observations%d' % i].append(info.observations[i][idx])
if self.use_states:
self.training_buffer[agent_id]['states'].append(info.states[idx])
if self.use_recurrent:

info = info[self.brain_name]
for l in range(len(info.agents)):
if ((info.local_done[l] or
len(self.training_buffer[info.agents[l]]['actions']) > self.trainer_parameters['time_horizon'])
and len(self.training_buffer[info.agents[l]]['actions']) > 0):
agent_actions = self.training_buffer[info.agents[l]]['actions']
if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon'])
and len(agent_actions) > 0):
feed_dict = {self.model.batch_size: len(info.states), self.model.sequence_length :1}
if self.use_observations:
for i in range(self.info.observations):
feed_dict = {self.model.batch_size: len(info.states), self.model.sequence_length: 1}
if self.use_observations:
for i in range(info.observations):
feed_dict[self.model.observation_in[i]] = info.observations[i]
if self.use_states:
feed_dict[self.model.state_in] = info.states

get_gae(
rewards=self.training_buffer[agent_id]['rewards'].get_batch(),
value_estimates=self.training_buffer[agent_id]['value_estimates'].get_batch(),
value_next=value_next,
gamma=self.trainer_parameters['gamma'],
value_next=value_next,
gamma=self.trainer_parameters['gamma'],
self.training_buffer[agent_id]['advantages'].get_batch() \
+ self.training_buffer[agent_id]['value_estimates'].get_batch())
self.training_buffer[agent_id]['advantages'].get_batch() \
+ self.training_buffer[agent_id]['value_estimates'].get_batch())
self.training_buffer.append_update_buffer(agent_id,
batch_size = None, training_length=self.sequence_length)
self.training_buffer.append_update_buffer(agent_id,
batch_size=None, training_length=self.sequence_length)
self.training_buffer[agent_id].reset_agent()
if info.local_done[l]:

self.episode_steps[agent_id] = 0
def end_episode(self):
"""

def is_ready_update(self):
"""
Returns wether or not the trainer has enough elements to run update model
:return: A boolean corresponding to wether or not update_model() can be run
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
return len(self.training_buffer.update_buffer['actions']) > self.trainer_parameters['buffer_size']

total_v, total_p = 0, 0
advantages = self.training_buffer.update_buffer['advantages'].get_batch()
self.training_buffer.update_buffer['advantages'].set(
(advantages - advantages.mean()) / advantages.std())
(advantages - advantages.mean()) / advantages.std())
feed_dict = {self.model.batch_size:batch_size,
self.model.sequence_length: self.sequence_length,
self.model.returns_holder: np.array(_buffer['discounted_returns'][start:end]).reshape([-1]),
self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1,1]),
self.model.old_probs: np.array(
_buffer['action_probs'][start:end]).reshape([-1,self.brain.action_space_size])}
feed_dict = {self.model.batch_size: batch_size,
self.model.sequence_length: self.sequence_length,
self.model.returns_holder: np.array(_buffer['discounted_returns'][start:end]).reshape(
[-1]),
self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1, 1]),
self.model.old_probs: np.array(
_buffer['action_probs'][start:end]).reshape([-1, self.brain.action_space_size])}
_buffer['epsilons'][start:end]).reshape([-1,self.brain.action_space_size])
_buffer['epsilons'][start:end]).reshape([-1, self.brain.action_space_size])
else:
feed_dict[self.model.action_holder] = np.array(
_buffer['actions'][start:end]).reshape([-1])

_buffer['states'][start:end]).reshape([-1,self.brain.state_space_size])
_buffer['states'][start:end]).reshape(
[-1, self.brain.state_space_size * self.brain.stacked_states])
_buffer['states'][start:end]).reshape([-1,1])
_buffer['states'][start:end]).reshape([-1, 1])
_obs = np.array(_buffer['observations%d'%i][start:end])
_obs = np.array(_buffer['observations%d' % i][start:end])
feed_dict[self.model.observation_in[i]] = _obs.reshape([-1,_w,_h,_c])
#Memories are zeros
feed_dict[self.model.observation_in[i]] = _obs.reshape([-1, _w, _h, _c])
# Memories are zeros
feed_dict[self.model.memory_in] = np.zeros([batch_size , self.m_size])
feed_dict[self.model.memory_in] = np.zeros([batch_size, self.m_size])
v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
self.model.update_batch], feed_dict=feed_dict)
total_v += v_loss

steps = self.get_step
if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
logger.info(" {0}: Step: {1}. Mean Reward: {2}. Std of Reward: {3}."
.format(self.brain_name, steps, mean_reward, np.std(self.stats['cumulative_reward'])))
logger.info(" {}: Step: {}. Mean Reward: {:0.3f}. Std of Reward: {:0.3f}."
.format(self.brain_name, steps, mean_reward, np.std(self.stats['cumulative_reward'])))
summary = tf.Summary()
for key in self.stats:
if len(self.stats[key]) > 0:

"""
value_estimates = np.asarray(value_estimates.tolist() + [value_next])
delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
advantage = discount_rewards(r=delta_t, gamma=gamma*lambd)
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
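The last two lines above compute generalized advantage estimation. Below is a minimal numpy sketch of get_gae together with a discount_rewards helper written from its standard definition (the real helper lives in ppo_trainer.py and may differ in detail); the sample rewards and values are arbitrary.

import numpy as np

def discount_rewards(r, gamma=0.99, value_next=0.0):
    """Discounted sum of a reward (or TD-residual) sequence, bootstrapped with value_next."""
    discounted = np.zeros_like(r)
    running = value_next
    for t in reversed(range(len(r))):
        running = r[t] + gamma * running
        discounted[t] = running
    return discounted

def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    value_estimates = np.asarray(list(value_estimates) + [value_next])
    delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
    return discount_rewards(r=delta_t, gamma=gamma * lambd)

rewards = np.array([0.0, 0.0, 1.0])
values = np.array([0.5, 0.6, 0.7])
advantages = get_gae(rewards, values)
returns = advantages + values  # the 'discounted_returns' fed to the value loss
print(advantages, returns)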

13
python/trainers/trainer.py


logger = logging.getLogger("unityagents")
class UnityTrainerException(UnityException):
"""
Related to errors with the Trainer.

class Trainer(object):
"""This class is the abstract class for the trainers"""
Responsible for collecting experiences and training PPO model.
Responsible for collecting experiences and training a neural network model.
:param sess: Tensorflow session.
:param env: The UnityEnvironment.
:param trainer_parameters: The parameters for the trainer (dictionary).

self.trainer_parameters = trainer_parameters
self.is_training = training
self.sess = sess
def __str__(self):
return '''Empty Trainer'''

"""
raise UnityTrainerException("The process_experiences method was not implemented.")
def end_episode(self):
"""
A signal that the Episode has ended. The buffer must be reset.

"""
try:
s_op = tf.summary.text(key,
tf.convert_to_tensor(([[str(x), str(input_dict[x])] for x in input_dict]))
)
tf.convert_to_tensor(([[str(x), str(input_dict[x])] for x in input_dict]))
)

13
python/unityagents/brain.py


"""
self.brain_name = brain_name
self.state_space_size = brain_param["stateSize"]
self.stacked_states = brain_param["stackedStates"]
self.number_observations = len(brain_param["cameraResolutions"])
self.camera_resolutions = brain_param["cameraResolutions"]
self.action_space_size = brain_param["actionSize"]

Number of observations (per agent): {1}
State space type: {2}
State space size (per agent): {3}
Action space type: {4}
Action space size (per agent): {5}
Memory space size (per agent): {6}
Action descriptions: {7}'''.format(self.brain_name,
Number of stacked states: {4}
Action space type: {5}
Action space size (per agent): {6}
Memory space size (per agent): {7}
Action descriptions: {8}'''.format(self.brain_name,
str(self.state_space_size), self.action_space_type,
str(self.state_space_size), str(self.stacked_states),
self.action_space_type,
str(self.action_space_size),
str(self.memory_space_size),
', '.join(self.action_descriptions))

6
python/unityagents/environment.py


n_agent = len(state_dict["agents"])
try:
if self._brains[b].state_space_type == "continuous":
states = np.array(state_dict["states"]).reshape((n_agent, self._brains[b].state_space_size))
states = np.array(state_dict["states"]).reshape((n_agent, self._brains[b].state_space_size * self._brains[b].stacked_states))
states = np.array(state_dict["states"]).reshape((n_agent, 1))
states = np.array(state_dict["states"]).reshape((n_agent, self._brains[b].stacked_states))
else str(self._brains[b].state_space_size * n_agent),
else str(self._brains[b].state_space_size * n_agent * self._brains[b].stacked_states),
self._brains[b].state_space_type,
len(state_dict["states"])))
memories = np.array(state_dict["memories"]).reshape((n_agent, self._brains[b].memory_space_size))
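A small numpy sketch of the reshape shown above: Unity sends one flat list of floats per brain, and the Python side views it as an (n_agents, state_size * stacked_states) matrix. The sizes used here are arbitrary.

import numpy as np

n_agents, state_size, stacked_states = 10, 3, 2   # illustrative sizes
flat_states = np.arange(n_agents * state_size * stacked_states, dtype=np.float32)

# Each row holds one agent's stacked_states most recent states, concatenated.
states = flat_states.reshape((n_agents, state_size * stacked_states))
print(states.shape)  # (10, 6) -- the width the PPO model's state_in placeholder expects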

54
unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs


return state;
}
// to be implemented by the developer
float action_z = act[0];
if (action_z > 2f)
{
action_z = 2f;
}
if (action_z < -2f)
{
action_z = -2f;
}
float action_z = 2f * Mathf.Clamp(act[0], -1f, 1f);
float action_x = act[1];
if (action_x > 2f)
{
action_x = 2f;
}
if (action_x < -2f)
{
action_x = -2f;
}
float action_x = 2f * Mathf.Clamp(act[1], -1f, 1f);
if (done == false)
{
reward = 0.1f;
}
}
else
{
int action = (int)act[0];
if (action == 0 || action == 1)
{
action = (action * 2) - 1;
float changeValue = action * 2f;
if ((gameObject.transform.rotation.z < 0.25f && changeValue > 0f) ||
(gameObject.transform.rotation.z > -0.25f && changeValue < 0f))
{
gameObject.transform.Rotate(new Vector3(0, 0, 1), changeValue);
}
}
if (action == 2 || action == 3)
{
action = ((action - 2) * 2) - 1;
float changeValue = action * 2f;
if ((gameObject.transform.rotation.x < 0.25f && changeValue > 0f) ||
(gameObject.transform.rotation.x > -0.25f && changeValue < 0f))
{
gameObject.transform.Rotate(new Vector3(1, 0, 0), changeValue);
}
}
if (done == false)
if (!done)
{
reward = 0.1f;
}

}
// to be implemented by the developer
public override void AgentReset()
{
gameObject.transform.rotation = new Quaternion(0f, 0f, 0f, 0f);
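For completeness, a numpy rendering of the clamp refactor in the C# excerpt above: the per-axis if/else chains collapse into a single clip-and-scale that maps an action in [-1, 1] onto a rotation increment in [-2, 2]. The function name is illustrative only.

import numpy as np

def platform_rotation_deltas(act):
    act = np.asarray(act, dtype=np.float32)
    action_z = 2.0 * np.clip(act[0], -1.0, 1.0)
    action_x = 2.0 * np.clip(act[1], -1.0, 1.0)
    return action_z, action_x

print(platform_rotation_deltas([3.0, -0.5]))  # (2.0, -1.0)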