
On Demand Decision (#308)

* On Demand Decision: Use RequestDecision and RequestAction
* New Agent Inspector: Use it to set On Demand Decision
* New BrainParameters interface
* LSTM memory size is now set in python
* New C# API
* Semantic Changes (a Python sketch of the renamed API follows this list)
* Replaced RunMDP
* New Bouncer Environment to test On Demand Decision
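
Many of the Python hunks below apply the same rename: states and observations on BrainInfo become vector_observations and visual_observations, and env.step() takes vector_action (plus an optional text_action) instead of action and value. The following is a minimal sketch of the renamed Python API, assuming a built environment binary at a hypothetical path and a continuous vector action space:

import numpy as np
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="BouncerEnv")  # hypothetical binary path
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

info = env.reset(train_mode=True)[brain_name]
print(info.vector_observations.shape)   # was info.states
print(len(info.visual_observations))    # was info.observations

# step() now takes vector_action / text_action instead of action / value.
action = np.random.randn(len(info.agents), brain.vector_action_space_size)
info = env.step(vector_action=action)[brain_name]
env.close()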
Branch: /develop-generalizationTraining-TrainerController
GitHub · 7 years ago
Current commit: f134016b
114 files changed, with 9,966 additions and 3,654 deletions
  1. python/Basics.ipynb (10)
  2. python/tests/test_bc.py (24)
  3. python/tests/test_ppo.py (24)
  4. python/tests/test_unityagents.py (74)
  5. python/tests/test_unitytrainers.py (15)
  6. python/trainer_config.yaml (2)
  7. python/unityagents/brain.py (53)
  8. python/unityagents/environment.py (148)
  9. python/unitytrainers/bc/models.py (2)
  10. python/unitytrainers/bc/trainer.py (97)
  11. python/unitytrainers/buffer.py (11)
  12. python/unitytrainers/models.py (29)
  13. python/unitytrainers/ppo/models.py (2)
  14. python/unitytrainers/ppo/trainer.py (129)
  15. python/unitytrainers/trainer_controller.py (14)
  16. unity-environment/Assets/ML-Agents/Editor/BrainEditor.cs (59)
  17. unity-environment/Assets/ML-Agents/Examples/3DBall/3DHardScene.unity (272)
  18. unity-environment/Assets/ML-Agents/Examples/3DBall/3DScene.unity (845)
  19. unity-environment/Assets/ML-Agents/Examples/3DBall/Prefabs/Game.prefab (24)
  20. unity-environment/Assets/ML-Agents/Examples/3DBall/Prefabs/GameHard.prefab (34)
  21. unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (45)
  22. unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DDecision.cs (8)
  23. unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs (25)
  24. unity-environment/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DHardAgent.cs.meta (5)
  25. unity-environment/Assets/ML-Agents/Examples/Area.meta (6)
  26. unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/PushArea.prefab (28)
  27. unity-environment/Assets/ML-Agents/Examples/Area/Prefabs/WallArea.prefab (30)
  28. unity-environment/Assets/ML-Agents/Examples/Area/Push.unity (239)
  29. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/AreaAgent.cs (25)
  30. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/AreaDecision.cs (7)
  31. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/GoalInteract.cs (4)
  32. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Push/PushAgent.cs (45)
  33. unity-environment/Assets/ML-Agents/Examples/Area/Scripts/Wall/WallAgent.cs (44)
  34. unity-environment/Assets/ML-Agents/Examples/Area/Wall.unity (208)
  35. unity-environment/Assets/ML-Agents/Examples/Banana.meta (7)
  36. unity-environment/Assets/ML-Agents/Examples/Banana/BananaImitation.unity (393)
  37. unity-environment/Assets/ML-Agents/Examples/Banana/BananaRL.unity (540)
  38. unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/Agent 1.prefab (56)
  39. unity-environment/Assets/ML-Agents/Examples/Banana/Prefabs/AreaPB.prefab (161)
  40. unity-environment/Assets/ML-Agents/Examples/Banana/Scripts/BananaAgent.cs (33)
  41. unity-environment/Assets/ML-Agents/Examples/Basic.meta (6)
  42. unity-environment/Assets/ML-Agents/Examples/Basic/Scene.unity (203)
  43. unity-environment/Assets/ML-Agents/Examples/Basic/Scripts/BasicAgent.cs (17)
  44. unity-environment/Assets/ML-Agents/Examples/Basic/Scripts/BasicDecision.cs (6)
  45. unity-environment/Assets/ML-Agents/Examples/Basic/TFModels/Basic.bytes (22)
  46. unity-environment/Assets/ML-Agents/Examples/Crawler.meta (4)
  47. unity-environment/Assets/ML-Agents/Examples/Crawler/Crawler.unity (168)
  48. unity-environment/Assets/ML-Agents/Examples/Crawler/Resources/Crawler.prefab (37)
  49. unity-environment/Assets/ML-Agents/Examples/Crawler/Scripts/CrawlerAgentConfigurable.cs (67)
  50. unity-environment/Assets/ML-Agents/Examples/GridWorld.meta (6)
  51. unity-environment/Assets/ML-Agents/Examples/GridWorld/GridWorld.unity (279)
  52. unity-environment/Assets/ML-Agents/Examples/GridWorld/Resources/agent.prefab (2)
  53. unity-environment/Assets/ML-Agents/Examples/GridWorld/Scripts/GridAgent.cs (18)
  54. unity-environment/Assets/ML-Agents/Examples/Hallway/Prefabs/HallwayArea.prefab (32)
  55. unity-environment/Assets/ML-Agents/Examples/Hallway/Scripts/HallwayAgent.cs (27)
  56. unity-environment/Assets/ML-Agents/Examples/Reacher.meta (6)
  57. unity-environment/Assets/ML-Agents/Examples/Reacher/Prefabs/Agent.prefab (30)
  58. unity-environment/Assets/ML-Agents/Examples/Reacher/Scene.unity (154)
  59. unity-environment/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherAgent.cs (57)
  60. unity-environment/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherDecision.cs (6)
  61. unity-environment/Assets/ML-Agents/Examples/Reacher/Scripts/ReacherGoal.cs (2)
  62. unity-environment/Assets/ML-Agents/Examples/Tennis.meta (4)
  63. unity-environment/Assets/ML-Agents/Examples/Tennis/Prefabs/TennisArea.prefab (62)
  64. unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/TennisAgent.cs (21)
  65. unity-environment/Assets/ML-Agents/Examples/Tennis/Scripts/hitWall.cs (248)
  66. unity-environment/Assets/ML-Agents/Examples/Tennis/Tennis.unity (301)
  67. unity-environment/Assets/ML-Agents/Scripts/Academy.cs (304)
  68. unity-environment/Assets/ML-Agents/Scripts/Agent.cs (727)
  69. unity-environment/Assets/ML-Agents/Scripts/Brain.cs (454)
  70. unity-environment/Assets/ML-Agents/Scripts/Communicator.cs (7)
  71. unity-environment/Assets/ML-Agents/Scripts/CoreBrain.cs (7)
  72. unity-environment/Assets/ML-Agents/Scripts/CoreBrainExternal.cs (25)
  73. unity-environment/Assets/ML-Agents/Scripts/CoreBrainHeuristic.cs (63)
  74. unity-environment/Assets/ML-Agents/Scripts/CoreBrainInternal.cs (216)
  75. unity-environment/Assets/ML-Agents/Scripts/CoreBrainPlayer.cs (93)
  76. unity-environment/Assets/ML-Agents/Scripts/Decision.cs (4)
  77. unity-environment/Assets/ML-Agents/Scripts/ExternalCommunicator.cs (265)
  78. unity-environment/Assets/ML-Agents/Scripts/Monitor.cs (5)
  79. unity-environment/Assets/ML-Agents/Template.meta (6)
  80. unity-environment/Assets/ML-Agents/Template/Scripts/TemplateAgent.cs (6)
  81. unity-environment/Assets/ML-Agents/Template/Scripts/TemplateDecision.cs (6)
  82. unity-environment/Assets/ML-Agents/Editor/AgentEditor.cs (68)
  83. unity-environment/Assets/ML-Agents/Editor/AgentEditor.cs.meta (12)
  84. unity-environment/Assets/ML-Agents/Editor/MLAgentsEditModeTest.cs (745)
  85. unity-environment/Assets/ML-Agents/Editor/MLAgentsEditModeTest.cs.meta (12)
  86. unity-environment/Assets/ML-Agents/Editor/ResetParameterDrawer.cs (139)
  87. unity-environment/Assets/ML-Agents/Editor/ResetParameterDrawer.cs.meta (12)
  88. unity-environment/Assets/ML-Agents/Examples/3DBall/TFModels/ball-EBS_ppo.bytes (1001)
  89. unity-environment/Assets/ML-Agents/Examples/3DBall/TFModels/ball.bytes (785)
  90. unity-environment/Assets/ML-Agents/Examples/3DBall/TFModels/ball.bytes.meta (8)
  91. unity-environment/Assets/ML-Agents/Examples/3DBall/ball-EBS-hard_ppo.bytes (1001)
  92. unity-environment/Assets/ML-Agents/Examples/3DBall/ball-EBS_ppo.bytes (1001)
  93. unity-environment/Assets/ML-Agents/Examples/Bouncer.meta (10)
  94. unity-environment/Assets/ML-Agents/Scripts/ResetParameters.cs (52)
  95. unity-environment/Assets/ML-Agents/Scripts/ResetParameters.cs.meta (12)
  96. unity-environment/Assets/ML-Agents/Examples/Bouncer/Bouncer.unity (993)
  97. unity-environment/Assets/ML-Agents/Examples/Bouncer/Bouncer.unity.meta (9)
  98. unity-environment/Assets/ML-Agents/Examples/Bouncer/Prefabs.meta (10)

python/Basics.ipynb (10)


"env_info = env.reset(train_mode=train_mode)[default_brain]\n",
"\n",
"# Examine the state space for the default brain\n",
"print(\"Agent state looks like: \\n{}\".format(env_info.states[0]))\n",
"print(\"Agent state looks like: \\n{}\".format(env_info.vector_observations[0]))\n",
"for observation in env_info.observations:\n",
"for observation in env_info.visual_observations:\n",
" print(\"Agent observations look like:\")\n",
" if observation.shape[3] == 3:\n",
" plt.imshow(observation[0,:,:,:])\n",

" done = False\n",
" episode_rewards = 0\n",
" while not done:\n",
" if brain.action_space_type == 'continuous':\n",
" if brain.vector_action_space_type == 'continuous':\n",
" brain.action_space_size))[default_brain]\n",
" brain.vector_action_space_size))[default_brain]\n",
" env_info = env.step(np.random.randint(0, brain.action_space_size, \n",
" env_info = env.step(np.random.randint(0, brain.vector_action_space_size, \n",
" size=(len(env_info.agents))))[default_brain]\n",
" episode_rewards += env_info.rewards[0]\n",
" done = env_info.local_done[0]\n",

python/tests/test_bc.py (24)


"logPath":"RealFakePath",
"apiNumber":"API-2",
"brainParameters": [{
"stateSize": 3,
"stackedStates": 2,
"actionSize": 2,
"vectorObservationSize": 3,
"numStackedVectorObservations": 2,
"vectorActionSize": 2,
"actionDescriptions": ["",""],
"actionSpaceType": 1,
"stateSpaceType": 1
"vectorActionDescriptions": ["",""],
"vectorActionSpaceType": 1,
"vectorObservationSpaceType": 1
}]
}'''.encode()

"logPath":"RealFakePath",
"apiNumber":"API-2",
"brainParameters": [{
"stateSize": 3,
"stackedStates": 2,
"actionSize": 2,
"vectorObservationSize": 3,
"numStackedVectorObservations": 2,
"vectorActionSize": 2,
"actionDescriptions": ["",""],
"actionSpaceType": 0,
"stateSpaceType": 1
"vectorActionDescriptions": ["",""],
"vectorActionSpaceType": 0,
"vectorObservationSpaceType": 1
}]
}'''.encode()

python/tests/test_ppo.py (24)


"logPath":"RealFakePath",
"apiNumber":"API-2",
"brainParameters": [{
"stateSize": 3,
"stackedStates": 2,
"actionSize": 2,
"vectorObservationSize": 3,
"numStackedVectorObservations": 2,
"vectorActionSize": 2,
"actionDescriptions": ["",""],
"actionSpaceType": 1,
"stateSpaceType": 1
"vectorActionDescriptions": ["",""],
"vectorActionSpaceType": 1,
"vectorObservationSpaceType": 1
}]
}'''.encode()

"logPath":"RealFakePath",
"apiNumber":"API-2",
"brainParameters": [{
"stateSize": 3,
"stackedStates": 2,
"actionSize": 2,
"vectorObservationSize": 3,
"numStackedVectorObservations": 2,
"vectorActionSize": 2,
"actionDescriptions": ["",""],
"actionSpaceType": 0,
"stateSpaceType": 1
"vectorActionDescriptions": ["",""],
"vectorActionSpaceType": 0,
"vectorObservationSpaceType": 1
}]
}'''.encode()

python/tests/test_unityagents.py (74)


"logPath":"RealFakePath",
"apiNumber":"API-2",
"brainParameters": [{
"stateSize": 3,
"stackedStates" : 2,
"actionSize": 2,
"vectorObservationSize": 3,
"numStackedVectorObservations": 2,
"vectorActionSize": 2,
"actionDescriptions": ["",""],
"actionSpaceType": 1,
"stateSpaceType": 1
"vectorActionDescriptions": ["",""],
"vectorActionSpaceType": 1,
"vectorObservationSpaceType": 1
}]
}'''.encode()

{
"brain_name": "RealFakeBrain",
"agents": [1,2],
"states": [1,2,3,4,5,6,1,2,3,4,5,6],
"vectorObservations": [1,2,3,4,5,6,1,2,3,4,5,6],
"actions": [1,2,3,4],
"vectorActions": [1,2,3,4],
"maxes": [false, false]
"maxes": [false, false],
"textObservations" :[" "," "]
'False'.encode()]
append_length('END_OF_MESSAGE:False')]
dummy_step = ['actions'.encode(),
append_length('''

"states": [1,2,3,4,5,6,7,8,9,1,2,3,4,5,6,7,8,9],
"vectorObservations": [1,2,3,4,5,6,7,8,9,1,2,3,4,5,6,7,8,9],
"actions": [1,2,3,4,5,6],
"vectorActions": [1,2,3,4,5,6],
"maxes": [false, false, false]
"maxes": [false, false, false],
"textObservations" :[" "," ", " "]
'False'.encode(),
append_length('END_OF_MESSAGE:False'),
"states": [1,2,3,4,5,6,7,8,9,1,2,3,4,5,6,7,8,9],
"vectorObservations": [1,2,3,4,5,6,7,8,9,1,2,3,4,5,6,7,8,9],
"actions": [1,2,3,4,5,6],
"vectorActions": [1,2,3,4,5,6],
"maxes": [false, false, false]
"maxes": [false, false, false],
"textObservations" :[" "," ", " "]
'True'.encode()]
append_length('END_OF_MESSAGE:True')]
dummy_curriculum = json.loads('''{
"measure" : "reward",

assert not env.global_done
assert isinstance(brain_info, dict)
assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].states, np.ndarray)
assert len(brain_info['RealFakeBrain'].observations) == brain.number_observations
assert brain_info['RealFakeBrain'].states.shape[0] == len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size * brain.stacked_states
assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
assert len(brain_info['RealFakeBrain'].visual_observations) == brain.number_visual_observations
assert brain_info['RealFakeBrain'].vector_observations.shape[0] == \
len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].vector_observations.shape[1] == \
brain.vector_observation_space_size * brain.num_stacked_vector_observations
def test_step():

mock_socket.recv.side_effect = dummy_reset
brain_info = env.reset()
mock_socket.recv.side_effect = dummy_step
brain_info = env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
brain_info = env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))
env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].states, np.ndarray)
assert len(brain_info['RealFakeBrain'].observations) == brain.number_observations
assert brain_info['RealFakeBrain'].states.shape[0] == len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size * brain.stacked_states
assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
assert len(brain_info['RealFakeBrain'].visual_observations) == brain.number_visual_observations
assert brain_info['RealFakeBrain'].vector_observations.shape[0] == \
len(brain_info['RealFakeBrain'].agents)
assert brain_info['RealFakeBrain'].vector_observations.shape[1] == \
brain.vector_observation_space_size * brain.num_stacked_vector_observations
assert not brain_info['RealFakeBrain'].local_done[0]
assert brain_info['RealFakeBrain'].local_done[2]

assert curriculum.get_config(0) == {"param1": 0.7, "param2": 100, "param3": 0.2}
assert curriculum.lesson_length == 0
assert curriculum.get_lesson_number == 2
if __name__ == '__main__':
pytest.main()

python/tests/test_unitytrainers.py (15)


"logPath":"RealFakePath",
"apiNumber":"API-2",
"brainParameters": [{
"stateSize": 3,
"stackedStates" : 2,
"actionSize": 2,
"vectorObservationSize": 3,
"numStackedVectorObservations" : 2,
"vectorActionSize": 2,
"actionDescriptions": ["",""],
"actionSpaceType": 1,
"stateSpaceType": 1
"vectorActionDescriptions": ["",""],
"vectorActionSpaceType": 1,
"vectorObservationSpaceType": 1
}]
}'''.encode()

sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
''')
dummy_bc_config = yaml.load('''

sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
''')
dummy_bad_config = yaml.load('''

sequence_length: 64
summary_freq: 1000
use_recurrent: false
memory_size: 8
''')

python/trainer_config.yaml (2)


lambd: 0.95
learning_rate: 3.0e-4
max_steps: 5.0e4
memory_size: 256
normalize: false
num_epoch: 5
num_layers: 2

num_epoch: 3
beta: 5.0e-4
hidden_units: 64
use_recurrent: true
sequence_length: 8
time_horizon: 8
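
The hunk above adds memory_size to a recurrent trainer block; the trainer hunks further down read it from trainer_parameters instead of the brain's memory_space_size. A small sketch of parsing such a block on the Python side, with an illustrative brain name and values:

import yaml

config_text = """
BouncerBrain:            # hypothetical brain name
  use_recurrent: true
  sequence_length: 8
  time_horizon: 8
  memory_size: 256
"""
trainer_parameters = yaml.safe_load(config_text)["BouncerBrain"]
m_size = trainer_parameters["memory_size"]  # LSTM memory size now comes from the config
print(m_size)  # 256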

python/unityagents/brain.py (53)


class BrainInfo:
def __init__(self, observation, state, memory=None, reward=None, agents=None, local_done=None,
action=None, max_reached=None):
def __init__(self, visual_observation, vector_observation, text_observations, memory=None,
reward=None, agents=None, local_done=None,
action=None, max_reached=None):
self.observations = observation
self.states = state
self.visual_observations = visual_observation
self.vector_observations = vector_observation
self.text_observations = text_observations
self.memories = memory
self.rewards = reward
self.local_done = local_done

:param brain_param: Dictionary of brain parameters.
"""
self.brain_name = brain_name
self.state_space_size = brain_param["stateSize"]
self.stacked_states = brain_param["stackedStates"]
self.number_observations = len(brain_param["cameraResolutions"])
self.vector_observation_space_size = brain_param["vectorObservationSize"]
self.num_stacked_vector_observations = brain_param["numStackedVectorObservations"]
self.number_visual_observations = len(brain_param["cameraResolutions"])
self.action_space_size = brain_param["actionSize"]
self.memory_space_size = brain_param["memorySize"]
self.action_descriptions = brain_param["actionDescriptions"]
self.action_space_type = ["discrete", "continuous"][brain_param["actionSpaceType"]]
self.state_space_type = ["discrete", "continuous"][brain_param["stateSpaceType"]]
self.vector_action_space_size = brain_param["vectorActionSize"]
self.vector_action_descriptions = brain_param["vectorActionDescriptions"]
self.vector_action_space_type = ["discrete", "continuous"][brain_param["vectorActionSpaceType"]]
self.vector_observation_space_type = ["discrete", "continuous"][brain_param["vectorObservationSpaceType"]]
Number of observations (per agent): {1}
State space type: {2}
State space size (per agent): {3}
Number of stacked states: {4}
Action space type: {5}
Action space size (per agent): {6}
Memory space size (per agent): {7}
Action descriptions: {8}'''.format(self.brain_name,
str(self.number_observations), self.state_space_type,
str(self.state_space_size), str(self.stacked_states),
self.action_space_type,
str(self.action_space_size),
str(self.memory_space_size),
', '.join(self.action_descriptions))
Number of Visual Observations (per agent): {1}
Vector Observation space type: {2}
Vector Observation space size (per agent): {3}
Number of stacked Vector Observation: {4}
Vector Action space type: {5}
Vector Action space size (per agent): {6}
Vector Action descriptions: {7}'''.format(self.brain_name,
str(self.number_visual_observations),
self.vector_observation_space_type,
str(self.vector_observation_space_size),
str(self.num_stacked_vector_observations),
self.vector_action_space_type,
str(self.vector_action_space_size),
', '.join(self.vector_action_descriptions))

python/unityagents/environment.py (148)


self._global_done = None
self._academy_name = p["AcademyName"]
self._log_path = p["logPath"]
self._brains = AllBrainInfo()
# Need to instantiate new AllBrainInfo
self._brains = {}
self._brain_names = p["brainNames"]
self._external_brain_names = p["externalBrainNames"]
self._external_brain_names = [] if self._external_brain_names is None else self._external_brain_names

:return:
"""
state = self._recv_bytes().decode('utf-8')
if state[:14] == "END_OF_MESSAGE":
return {}, state[15:] == 'True'
return state_dict
return state_dict, None
def reset(self, train_mode=True, config=None, lesson=None) -> AllBrainInfo:
"""

Collects experience information from all external brains in environment at current step.
:return: a dictionary of BrainInfo objects.
"""
self._data = AllBrainInfo()
for index in range(self._num_brains):
state_dict = self._get_state_dict()
self._data = {}
while True:
state_dict, end_of_message = self._get_state_dict()
if end_of_message is not None:
self._global_done = end_of_message
for _b in self._brain_names:
if _b not in self._data:
self._data[_b] = BrainInfo([], np.array([]), [], np.array([]),
[], [], [], np.array([]), max_reached=[])
return self._data
if self._brains[b].state_space_type == "continuous":
states = np.array(state_dict["states"]).reshape((n_agent, self._brains[b].state_space_size * self._brains[b].stacked_states))
if self._brains[b].vector_observation_space_type == "continuous":
vector_obs = np.array(state_dict["vectorObservations"]).reshape(
(n_agent, self._brains[b].vector_observation_space_size
* self._brains[b].num_stacked_vector_observations))
states = np.array(state_dict["states"]).reshape((n_agent, self._brains[b].stacked_states))
vector_obs = np.array(state_dict["vectorObservations"]).reshape(
(n_agent, self._brains[b].num_stacked_vector_observations))
"Expecting {1} {2} state but received {3}."
.format(b, n_agent if self._brains[b].state_space_type == "discrete"
else str(self._brains[b].state_space_size * n_agent * self._brains[b].stacked_states),
self._brains[b].state_space_type,
"Expecting {1} {2} state but received {3}."
.format(b, n_agent if self._brains[b].vector_observation_space_type == "discrete"
else str(self._brains[b].vector_observation_space_size * n_agent
* self._brains[b].num_stacked_vector_observations),
self._brains[b].vector_observation_space_type,
memories = np.array(state_dict["memories"]).reshape((n_agent, self._brains[b].memory_space_size))
memories = np.array(state_dict["memories"]).reshape((n_agent, -1))
text_obs = state_dict["textObservations"]
actions = np.array(state_dict["actions"]).reshape((n_agent, -1))
vector_actions = np.array(state_dict["vectorActions"]).reshape((n_agent, -1))
actions = np.array([])
vector_actions = np.array([])
for o in range(self._brains[b].number_observations):
for o in range(self._brains[b].number_visual_observations):
self._data[b] = BrainInfo(observations, vector_obs, text_obs, memories, rewards,
agents, dones, vector_actions, max_reached=maxes)
self._data[b] = BrainInfo(observations, states, memories, rewards, agents,
dones, actions, max_reached=maxes)
try:
self._global_done = self._conn.recv(self._buffer_size).decode('utf-8') == 'True'
except socket.timeout as e:
raise UnityTimeOutException("The environment took too long to respond.", self._log_path)
return self._data
def _send_action(self, action, memory, value):
def _send_action(self, vector_action ,memory, text_action):
:param action: a dictionary of lists of actions.
:param vector_action: a dictionary of lists of vector actions.
:param value: a dictionary of lists of of value estimates.
:param text_action: a dictionary of lists of text actions.
action_message = {"action": action, "memory": memory, "value": value}
action_message = {"vector_action": vector_action, "memory": memory, "text_action": text_action}
self._conn.send(self._append_length(json.dumps(action_message).encode('utf-8')))
@staticmethod

arr = [float(x) for x in arr]
return arr
def step(self, action=None, memory=None, value=None) -> AllBrainInfo:
def step(self, vector_action=None, memory=None, text_action=None) -> AllBrainInfo:
:param action: Agent's action to send to environment. Can be a scalar or vector of int/floats.
:param vector_action: Agent's vector action to send to environment. Can be a scalar or vector of int/floats.
:param value: Value estimate to send to environment for visualization. Can be a scalar or vector of float(s).
:param text_action: Text action to send to environment for.
action = {} if action is None else action
vector_action = {} if vector_action is None else vector_action
value = {} if value is None else value
text_action = {} if text_action is None else text_action
if isinstance(action, (int, np.int_, float, np.float_, list, np.ndarray)):
if isinstance(vector_action, (int, np.int_, float, np.float_, list, np.ndarray)):
action = {self._external_brain_names[0]: action}
vector_action = {self._external_brain_names[0]: vector_action}
"and actions as values".format(self._num_brains))
"and vector_actions as values".format(self._num_brains))
"step cannot take an action input")
"step cannot take a vector_action input")
if isinstance(memory, (int, np.int_, float, np.float_, list, np.ndarray)):
if self._num_external_brains == 1:

raise UnityActionException(
"There are no external brains in the environment, "
"step cannot take a memory input")
if isinstance(value, (int, np.int_, float, np.float_, list, np.ndarray)):
if isinstance(text_action, (str, list, np.ndarray)):
value = {self._external_brain_names[0]: value}
text_action = {self._external_brain_names[0]: text_action}
"and state/action value estimates as values".format(self._num_brains))
"and text_actions as values".format(self._num_brains))
for brain_name in list(action.keys()) + list(memory.keys()) + list(value.keys()):
for brain_name in list(vector_action.keys()) + list(memory.keys()) + list(text_action.keys()):
if brain_name not in self._external_brain_names:
raise UnityActionException(
"The name {0} does not correspond to an external brain "

n_agent = len(self._data[b].agents)
if b not in action:
raise UnityActionException("You need to input an action for the brain {0}".format(b))
action[b] = self._flatten(action[b])
if b not in vector_action:
# raise UnityActionException("You need to input an action for the brain {0}".format(b))
if self._brains[b].vector_action_space_type == "discrete":
vector_action[b] = [0.0] * n_agent
else:
vector_action[b] = [0.0] * n_agent * self._brains[b].vector_action_space_size
else:
vector_action[b] = self._flatten(vector_action[b])
memory[b] = [0.0] * self._brains[b].memory_space_size * n_agent
memory[b] = []
memory[b] = [0.0] * self._brains[b].memory_space_size * n_agent
memory[b] = []
if b not in value:
value[b] = [0.0] * n_agent
if b not in text_action:
text_action[b] = [""] * n_agent
if value[b] is None:
value[b] = [0.0] * n_agent
if text_action[b] is None:
text_action[b] = []
value[b] = self._flatten(value[b])
if not (len(value[b]) == n_agent):
text_action[b] = [""] * n_agent
if not ((len(text_action[b]) == n_agent) or len(text_action[b]) == 0):
"There was a mismatch between the provided value and environment's expectation: "
"The brain {0} expected {1} value but was given {2}".format(b, n_agent, len(value[b])))
if not (len(memory[b]) == self._brains[b].memory_space_size * n_agent):
raise UnityActionException(
"There was a mismatch between the provided memory and environment's expectation: "
"The brain {0} expected {1} memories but was given {2}"
.format(b, self._brains[b].memory_space_size * n_agent, len(memory[b])))
if not ((self._brains[b].action_space_type == "discrete" and len(action[b]) == n_agent) or
(self._brains[b].action_space_type == "continuous" and len(
action[b]) == self._brains[b].action_space_size * n_agent)):
"There was a mismatch between the provided text_action and environment's expectation: "
"The brain {0} expected {1} text_action but was given {2}".format(
b, n_agent, len(text_action[b])))
if not ((self._brains[b].vector_action_space_type == "discrete" and len(vector_action[b]) == n_agent) or
(self._brains[b].vector_action_space_type == "continuous" and len(
vector_action[b]) == self._brains[b].vector_action_space_size * n_agent)):
.format(b, n_agent if self._brains[b].action_space_type == "discrete" else
str(self._brains[b].action_space_size * n_agent), self._brains[b].action_space_type,
str(action[b])))
.format(b, n_agent if self._brains[b].vector_action_space_type == "discrete" else
str(self._brains[b].vector_action_space_size * n_agent),
self._brains[b].vector_action_space_type,
str(vector_action[b])))
self._send_action(action, memory, value)
self._send_action(vector_action, memory, text_action)
return self._get_state()
elif not self._loaded:
raise UnityEnvironmentException("No Unity environment is loaded.")
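
The length checks in this hunk imply the flattened sizes step() expects per brain: one entry per agent for a discrete vector action space, and vector_action_space_size entries per agent for a continuous one. A tiny illustration with made-up numbers:

# Made-up sizes illustrating the flattened vector_action lengths checked above.
n_agent = 3
vector_action_space_size = 2

continuous_action = [0.0] * (vector_action_space_size * n_agent)  # length 6
discrete_action = [0] * n_agent                                    # length 3
print(len(continuous_action), len(discrete_action))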

python/unitytrainers/bc/models.py (2)


self.policy = tf.layers.dense(hidden_reg, self.a_size, activation=None, use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01))
if brain.action_space_type == "discrete":
if brain.vector_action_space_type == "discrete":
self.action_probs = tf.nn.softmax(self.policy)
self.sample_action = tf.cast(tf.multinomial(self.policy, 1, name="action"), tf.int32)
self.true_action = tf.placeholder(shape=[None], dtype=tf.int32, name="expert_action")

python/unitytrainers/bc/trainer.py (97)


"""
self.param_keys = ['brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope',
'summary_freq', 'max_steps', 'batches_per_epoch', 'use_recurrent', 'hidden_units',
'num_layers', 'sequence_length']
'num_layers', 'sequence_length', 'memory_size']
for k in self.param_keys:
if k not in trainer_parameters:

self.variable_scope = trainer_parameters['graph_scope']
self.brain_to_imitate = trainer_parameters['brain_to_imitate']
self.batch_size = trainer_parameters['batch_size']
self.batches_per_epoch = trainer_parameters['batches_per_epoch']
self.use_recurrent = trainer_parameters['use_recurrent']
self.step = 0

self.m_size = env.brains[brain_name].memory_space_size
self.m_size = trainer_parameters["memory_size"]
self.n_sequences = max(int(trainer_parameters['batch_size'] / self.sequence_length), 1)
self.is_continuous = (env.brains[brain_name].action_space_type == "continuous")
self.use_observations = (env.brains[brain_name].number_observations > 0)
self.is_continuous = (env.brains[brain_name].vector_action_space_type == "continuous")
self.use_observations = (env.brains[brain_name].number_visual_observations > 0)
self.use_states = (env.brains[brain_name].state_space_size > 0)
self.use_states = (env.brains[brain_name].vector_observation_space_size > 0)
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)

h_size=int(trainer_parameters['hidden_units']),
lr=float(trainer_parameters['learning_rate']),
n_layers=int(trainer_parameters['num_layers']),
m_size=self.brain.memory_space_size,
m_size=self.m_size,
normalize=False,
use_recurrent=trainer_parameters['use_recurrent'],
brain=self.brain)

:return: a tuple containing action, memories, values and an object
to be passed to add experiences
"""
if len(all_brain_info[self.brain_name].agents) == 0:
return [], [], [], None
for i, _ in enumerate(agent_brain.observations):
feed_dict[self.model.observation_in[i]] = agent_brain.observations[i]
for i, _ in enumerate(agent_brain.visual_observations):
feed_dict[self.model.observation_in[i]] = agent_brain.visual_observations[i]
feed_dict[self.model.state_in] = agent_brain.states
feed_dict[self.model.state_in] = agent_brain.vector_observations
if agent_brain.memories.shape[1] == 0:
agent_brain.memories = np.zeros((len(agent_brain.agents), self.m_size))
feed_dict[self.model.memory_in] = agent_brain.memories
run_list += [self.model.memory_out]
if self.use_recurrent:

info_expert = curr_info[self.brain_to_imitate]
next_info_expert = next_info[self.brain_to_imitate]
for agent_id in info_expert.agents:
if agent_id in next_info_expert.agents:
idx = info_expert.agents.index(agent_id)
idx = info_expert.agents.index(agent_id)
self.training_buffer[agent_id].last_brain_info = info_expert
self.training_buffer[agent_id].last_take_action_outputs = take_action_outputs
info_expert = None
take_action_outputs = None
for agent_id in next_info_expert.agents:
stored_info_expert = self.training_buffer[agent_id].last_brain_info
if stored_info_expert is None:
continue
else:
idx = stored_info_expert.agents.index(agent_id)
if not info_expert.local_done[idx]:
if not stored_info_expert.local_done[idx]:
for i, _ in enumerate(curr_info.observations):
self.training_buffer[agent_id]['observations%d' % i].append(info_expert.observations[i][idx])
for i, _ in enumerate(stored_info_expert.visual_observations):
self.training_buffer[agent_id]['observations%d' % i].append(stored_info_expert.visual_observations[i][idx])
self.training_buffer[agent_id]['states'].append(info_expert.states[idx])
self.training_buffer[agent_id]['states'].append(stored_info_expert.vector_observations[idx])
self.training_buffer[agent_id]['memory'].append(info_expert.memories[idx])
if stored_info_expert.memories.shape[1] == 0:
stored_info_expert.memories = np.zeros((len(stored_info_expert.agents), self.m_size))
self.training_buffer[agent_id]['memory'].append(stored_info_expert.memories[idx])
info_student = next_info[self.brain_name]
info_student = curr_info[self.brain_name]
next_idx = next_info_student.agents.index(agent_id)
if not info_student.local_done[idx]:
if agent_id not in self.cumulative_rewards:
self.cumulative_rewards[agent_id] = 0
self.cumulative_rewards[agent_id] += next_info_student.rewards[next_idx]
if agent_id not in self.episode_steps:
self.episode_steps[agent_id] = 0
self.episode_steps[agent_id] += 1
self.training_buffer[agent_id].last_brain_info = info_student
info_student = None
for agent_id in next_info_student.agents:
stored_info_student = self.training_buffer[agent_id].last_brain_info
if stored_info_student is None:
continue
else:
idx = stored_info_student.agents.index(agent_id)
next_idx = next_info_student.agents.index(agent_id)
if not stored_info_student.local_done[idx]:
if agent_id not in self.cumulative_rewards:
self.cumulative_rewards[agent_id] = 0
self.cumulative_rewards[agent_id] += next_info_student.rewards[next_idx]
if agent_id not in self.episode_steps:
self.episode_steps[agent_id] = 0
self.episode_steps[agent_id] += 1
def process_experiences(self, info: AllBrainInfo):
"""

Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
return len(self.training_buffer.update_buffer['actions']) > self.batch_size
return len(self.training_buffer.update_buffer['actions']) > self.n_sequences
batch_size = self.trainer_parameters['batch_size']
min(len(self.training_buffer.update_buffer['actions']) // self.batch_size, self.batches_per_epoch)):
min(len(self.training_buffer.update_buffer['actions']) // self.n_sequences, self.batches_per_epoch)):
start = j * batch_size
end = (j + 1) * batch_size
start = j * self.n_sequences
end = (j + 1) * self.n_sequences
feed_dict = {self.model.true_action: batch_actions.reshape([-1, self.brain.action_space_size]),
feed_dict = {self.model.true_action: batch_actions.reshape([-1, self.brain.vector_action_space_size]),
self.model.batch_size: batch_size,
self.model.batch_size: self.n_sequences,
feed_dict[self.model.state_in] = batch_states.reshape([-1, self.brain.state_space_size *
self.brain.stacked_states])
feed_dict[self.model.state_in] = batch_states.reshape([-1, self.brain.vector_observation_space_size *
self.brain.num_stacked_vector_observations])
if self.use_observations:
for i, _ in enumerate(self.model.observation_in):
_obs = np.array(_buffer['observations%d' % i][start:end])

feed_dict[self.model.memory_in] = np.zeros([batch_size, self.m_size])
feed_dict[self.model.memory_in] = np.zeros([self.n_sequences, self.m_size])
loss, _ = self.sess.run([self.model.loss, self.model.update], feed_dict=feed_dict)
batch_losses.append(loss)

python/unitytrainers/buffer.py (11)


"""
self[:] = []
def __init__(self):
self.last_brain_info = None
self.last_take_action_outputs = None
super(Buffer.AgentBuffer, self).__init__()
def __str__(self):
return ", ".join(["'{0}' : {1}".format(k, str(self[k])) for k in self.keys()])

"""
for k in self.keys():
self[k].reset_field()
self.last_brain_info = None
self.last_take_action_outputs = None
def __getitem__(self, key):
if key not in self.keys():

def reset_all(self):
"""
Resets the update buffer and all the local local_buffers
"""
Resets all the local local_buffers
"""
agent_ids = list(self.keys())
for k in agent_ids:
self[k].reset_agent()

python/unitytrainers/models.py (29)


self.m_size = m_size
self.normalize = normalize
self.use_recurrent = use_recurrent
self.a_size = brain.action_space_size
self.a_size = brain.vector_action_space_size
@staticmethod
def create_global_steps():

return tf.multiply(input_activation, tf.nn.sigmoid(input_activation))
@staticmethod
def create_visual_input(o_size_h, o_size_w, bw):
def create_visual_input(o_size_h, o_size_w, bw, name):
observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32)
observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32, name=name)
if self.brain.state_space_type == "continuous":
if self.brain.vector_observation_space_type == "continuous":
self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state')
if self.normalize:
self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,

def create_new_obs(self, num_streams, h_size, num_layers):
brain = self.brain
s_size = brain.state_space_size * brain.stacked_states
if brain.action_space_type == "continuous":
s_size = brain.vector_observation_space_size * brain.num_stacked_vector_observations
if brain.vector_action_space_type == "continuous":
for i in range(brain.number_observations):
for i in range(brain.number_visual_observations):
visual_input = self.create_visual_input(height_size, width_size, bw)
visual_input = tf.identity(visual_input, name="observation_in_" + str(i))
visual_input = self.create_visual_input(height_size, width_size, bw, name="visual_observation_" + str(i))
self.observation_in.append(visual_input)
self.create_vector_input(s_size)

hidden_state, hidden_visual = None, None
if brain.number_observations > 0:
for j in range(brain.number_observations):
if brain.number_visual_observations > 0:
for j in range(brain.number_visual_observations):
if brain.state_space_size > 0:
s_size = brain.state_space_size * brain.stacked_states
if brain.state_space_type == "continuous":
if brain.vector_observation_space_size > 0:
s_size = brain.vector_observation_space_size * brain.num_stacked_vector_observations
if brain.vector_observation_space_type == "continuous":
hidden_state = self.create_continuous_state_encoder(h_size, activation_fn, num_layers)
else:
hidden_state = self.create_discrete_state_encoder(s_size, h_size,

hidden = hidden_streams[0]
if self.use_recurrent:
tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
self.prev_action = tf.placeholder(shape=[None], dtype=tf.int32, name='prev_action')
self.prev_action_oh = c_layers.one_hot_encoding(self.prev_action, self.a_size)
hidden = tf.concat([hidden, self.prev_action_oh], axis=1)

hidden_streams = self.create_new_obs(num_streams, h_size, num_layers)
if self.use_recurrent:
tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name='recurrent_in')
_half_point = int(self.m_size / 2)
hidden_policy, memory_policy_out = self.create_recurrent_encoder(

python/unitytrainers/ppo/models.py (2)


if num_layers < 1:
num_layers = 1
self.last_reward, self.new_reward, self.update_reward = self.create_reward_encoder()
if brain.action_space_type == "continuous":
if brain.vector_action_space_type == "continuous":
self.create_cc_actor_critic(h_size, num_layers)
self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy
else:

python/unitytrainers/ppo/trainer.py (129)


'learning_rate',
'max_steps', 'normalize', 'num_epoch', 'num_layers', 'time_horizon', 'sequence_length',
'summary_freq',
'use_recurrent', 'graph_scope', 'summary_path']
'use_recurrent', 'graph_scope', 'summary_path', 'memory_size']
for k in self.param_keys:
if k not in trainer_parameters:

self.sequence_length = 1
self.m_size = None
if self.use_recurrent:
self.m_size = env.brains[brain_name].memory_space_size
self.m_size = trainer_parameters["memory_size"]
self.sequence_length = trainer_parameters["sequence_length"]
if self.use_recurrent:
if self.m_size == 0:

self.training_buffer = Buffer()
self.cumulative_rewards = {}
self.episode_steps = {}
self.is_continuous = (env.brains[brain_name].action_space_type == "continuous")
self.use_observations = (env.brains[brain_name].number_observations > 0)
self.use_states = (env.brains[brain_name].state_space_size > 0)
self.is_continuous = (env.brains[brain_name].vector_action_space_type == "continuous")
self.use_observations = (env.brains[brain_name].number_visual_observations > 0)
self.use_states = (env.brains[brain_name].vector_observation_space_size > 0)
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):
os.makedirs(self.summary_path)

"""
steps = self.get_step
curr_brain_info = all_brain_info[self.brain_name]
feed_dict = {self.model.batch_size: len(curr_brain_info.states), self.model.sequence_length: 1}
if len(curr_brain_info.agents) == 0:
return [], [], [], None
feed_dict = {self.model.batch_size: len(curr_brain_info.vector_observations), self.model.sequence_length: 1}
run_list = [self.model.output, self.model.all_probs, self.model.value, self.model.entropy,
self.model.learning_rate]
if self.is_continuous:

if self.use_observations:
for i, _ in enumerate(curr_brain_info.observations):
feed_dict[self.model.observation_in[i]] = curr_brain_info.observations[i]
for i, _ in enumerate(curr_brain_info.visual_observations):
feed_dict[self.model.observation_in[i]] = curr_brain_info.visual_observations[i]
feed_dict[self.model.state_in] = curr_brain_info.states
feed_dict[self.model.state_in] = curr_brain_info.vector_observations
if curr_brain_info.memories.shape[1] == 0:
curr_brain_info.memories = np.zeros((len(curr_brain_info.agents), self.m_size))
if (self.is_training and self.brain.state_space_type == "continuous" and
if (self.is_training and self.brain.vector_observation_space_type == "continuous" and
new_mean, new_variance = self.running_average(curr_brain_info.states, steps, self.model.running_mean,
self.model.running_variance)
new_mean, new_variance = self.running_average(
curr_brain_info.vector_observations, steps, self.model.running_mean, self.model.running_variance)
feed_dict[self.model.new_mean] = new_mean
feed_dict[self.model.new_variance] = new_variance
run_list = run_list + [self.model.update_mean, self.model.update_variance]

self.stats['entropy'].append(run_out[self.model.entropy])
self.stats['learning_rate'].append(run_out[self.model.learning_rate])
if self.use_recurrent:
return run_out[self.model.output], run_out[self.model.memory_out], run_out[self.model.value], run_out
return (run_out[self.model.output],
run_out[self.model.memory_out],
[str(v) for v in run_out[self.model.value]],
run_out)
return run_out[self.model.output], None, run_out[self.model.value], run_out
return (run_out[self.model.output],
None,
[str(v) for v in run_out[self.model.value]],
run_out)
def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take_action_outputs):
def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainInfo, take_action_outputs):
:param curr_info: Dictionary of all current brains and corresponding BrainInfo.
:param next_info: Dictionary of all current brains and corresponding BrainInfo.
:param curr_all_info: Dictionary of all current brains and corresponding BrainInfo.
:param next_all_info: Dictionary of all current brains and corresponding BrainInfo.
curr_info = curr_info[self.brain_name]
next_info = next_info[self.brain_name]
actions = take_action_outputs[self.model.output]
epsi = 0
if self.is_continuous:
epsi = take_action_outputs[self.model.epsilon]
a_dist = take_action_outputs[self.model.all_probs]
value = take_action_outputs[self.model.value]
curr_info = curr_all_info[self.brain_name]
next_info = next_all_info[self.brain_name]
if agent_id in next_info.agents:
idx = curr_info.agents.index(agent_id)
self.training_buffer[agent_id].last_brain_info = curr_info
self.training_buffer[agent_id].last_take_action_outputs = take_action_outputs
for agent_id in next_info.agents:
stored_info = self.training_buffer[agent_id].last_brain_info
stored_take_action_outputs = self.training_buffer[agent_id].last_take_action_outputs
if stored_info is None:
continue
else:
idx = stored_info.agents.index(agent_id)
if not curr_info.local_done[idx]:
if not stored_info.local_done[idx]:
for i, _ in enumerate(curr_info.observations):
self.training_buffer[agent_id]['observations%d' % i].append(curr_info.observations[i][idx])
for i, _ in enumerate(stored_info.visual_observations):
self.training_buffer[agent_id]['observations%d' % i].append(stored_info.visual_observations[i][idx])
self.training_buffer[agent_id]['states'].append(curr_info.states[idx])
self.training_buffer[agent_id]['states'].append(stored_info.vector_observations[idx])
self.training_buffer[agent_id]['memory'].append(curr_info.memories[idx])
if stored_info.memories.shape[1] == 0:
stored_info.memories = np.zeros((len(stored_info.agents), self.m_size))
self.training_buffer[agent_id]['memory'].append(stored_info.memories[idx])
epsi = stored_take_action_outputs[self.model.epsilon]
actions = stored_take_action_outputs[self.model.output]
a_dist = stored_take_action_outputs[self.model.all_probs]
value = stored_take_action_outputs[self.model.value]
self.training_buffer[agent_id]['prev_action'].append(info.previous_actions[idx])
self.training_buffer[agent_id]['prev_action'].append(stored_info.previous_actions[idx])
if agent_id not in self.cumulative_rewards:
self.cumulative_rewards[agent_id] = 0
self.cumulative_rewards[agent_id] += next_info.rewards[next_idx]

def process_experiences(self, info: AllBrainInfo):
def process_experiences(self, all_info: AllBrainInfo):
:param info: Dictionary of all current brains and corresponding BrainInfo.
:param all_info: Dictionary of all current brains and corresponding BrainInfo.
info = info[self.brain_name]
info = all_info[self.brain_name]
for l in range(len(info.agents)):
agent_actions = self.training_buffer[info.agents[l]]['actions']
if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon'])

else:
feed_dict = {self.model.batch_size: len(info.states), self.model.sequence_length: 1}
feed_dict = {self.model.batch_size: len(info.vector_observations), self.model.sequence_length: 1}
for i in range(len(info.observations)):
feed_dict[self.model.observation_in[i]] = info.observations[i]
for i in range(len(info.visual_observations)):
feed_dict[self.model.observation_in[i]] = info.visual_observations[i]
feed_dict[self.model.state_in] = info.states
feed_dict[self.model.state_in] = info.vector_observations
if info.memories.shape[1] == 0:
info.memories = np.zeros((len(info.vector_observations), self.m_size))
if not self.is_continuous:
if not self.is_continuous and self.use_recurrent:
feed_dict[self.model.prev_action] = np.reshape(info.previous_actions, [-1])
value_next = self.sess.run(self.model.value, feed_dict)[l]
agent_id = info.agents[l]

Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
return len(self.training_buffer.update_buffer['actions']) > self.trainer_parameters['buffer_size']
return len(self.training_buffer.update_buffer['actions']) > \
max(int(self.trainer_parameters['buffer_size'] / self.sequence_length), 1)
def update_model(self):
"""

batch_size = self.trainer_parameters['batch_size']
n_sequences = max(int(self.trainer_parameters['batch_size'] / self.sequence_length), 1)
total_v, total_p = 0, 0
advantages = self.training_buffer.update_buffer['advantages'].get_batch()
self.training_buffer.update_buffer['advantages'].set(

for l in range(len(self.training_buffer.update_buffer['actions']) // batch_size):
start = l * batch_size
end = (l + 1) * batch_size
for l in range(len(self.training_buffer.update_buffer['actions']) // n_sequences):
start = l * n_sequences
end = (l + 1) * n_sequences
feed_dict = {self.model.batch_size: batch_size,
feed_dict = {self.model.batch_size: n_sequences,
self.model.sequence_length: self.sequence_length,
self.model.mask_input: np.array(_buffer['masks'][start:end]).reshape(
[-1]),

self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1]),
self.model.all_old_probs: np.array(
_buffer['action_probs'][start:end]).reshape([-1, self.brain.action_space_size])}
_buffer['action_probs'][start:end]).reshape([-1, self.brain.vector_action_space_size])}
_buffer['epsilons'][start:end]).reshape([-1, self.brain.action_space_size])
_buffer['epsilons'][start:end]).reshape([-1, self.brain.vector_action_space_size])
else:
feed_dict[self.model.action_holder] = np.array(
_buffer['actions'][start:end]).reshape([-1])

if self.use_states:
if self.brain.state_space_type == "continuous":
if self.brain.vector_observation_space_type == "continuous":
[-1, self.brain.state_space_size * self.brain.stacked_states])
[-1, self.brain.vector_observation_space_size * self.brain.num_stacked_vector_observations])
_buffer['states'][start:end]).reshape([-1, self.brain.stacked_states])
_buffer['states'][start:end]).reshape([-1, self.brain.num_stacked_vector_observations])
if self.use_observations:
for i, _ in enumerate(self.model.observation_in):
_obs = np.array(_buffer['observations%d' % i][start:end])

python/unitytrainers/trainer_controller.py (14)


tf.set_random_seed(self.seed)
self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
curriculum=self.curriculum_file, seed=self.seed)
self.logger.info(str(self.env))
self.env_name = os.path.basename(os.path.normpath(env_path)) # Extract out name of environment
def _get_progress(self):

elif not self.trainers[brain_name].parameters["use_recurrent"]:
nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
else:
node_list = ["action", "value_estimate", "action_probs", "recurrent_out"]
node_list = ["action", "value_estimate", "action_probs", "recurrent_out", "memory_size"]
nodes += [scope + x for x in node_list]
if len(scopes) > 1:
self.logger.info("List of available scopes :")

for brain_name, trainer in self.trainers.items():
trainer.end_episode()
# Decide and take an action
take_action_actions, take_action_memories, take_action_values, take_action_outputs = {}, {}, {}, {}
take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
(take_action_actions[brain_name],
(take_action_vector[brain_name],
take_action_values[brain_name],
take_action_text[brain_name],
new_info = self.env.step(action=take_action_actions, memory=take_action_memories,
value=take_action_values)
new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories,
text_action=take_action_text)
for brain_name, trainer in self.trainers.items():
trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
curr_info = new_info

unity-environment/Assets/ML-Agents/Editor/BrainEditor.cs (59)


[CustomEditor (typeof(Brain))]
public class BrainEditor : Editor
{
[SerializeField]
bool _Foldout = true;
public override void OnInspectorGUI ()
{
Brain myBrain = (Brain)target;

}
BrainParameters parameters = myBrain.brainParameters;
if (parameters.actionDescriptions == null || parameters.actionDescriptions.Length != parameters.actionSize)
parameters.actionDescriptions = new string[parameters.actionSize];
if (parameters.vectorActionDescriptions == null || parameters.vectorActionDescriptions.Length != parameters.vectorActionSize)
parameters.vectorActionDescriptions = new string[parameters.vectorActionSize];
SerializedProperty bp = serializedBrain.FindProperty ("brainParameters");
EditorGUILayout.PropertyField(bp, true);
_Foldout = EditorGUILayout.Foldout(_Foldout, "Brain Parameters");
int indentLevel = EditorGUI.indentLevel;
if (_Foldout)
{
EditorGUI.indentLevel++;
EditorGUILayout.LabelField("Vector Observation");
EditorGUI.indentLevel++;
SerializedProperty bpVectorObsType = serializedBrain.FindProperty("brainParameters.vectorObservationSpaceType");
EditorGUILayout.PropertyField(bpVectorObsType, new GUIContent("Space Type", "Corresponds to whether state " +
"vector contains a single integer (Discrete) " +
"or a series of real-valued floats (Continuous)."));
SerializedProperty bpVectorObsSize = serializedBrain.FindProperty("brainParameters.vectorObservationSize");
EditorGUILayout.PropertyField(bpVectorObsSize, new GUIContent("Space Size", "Length of state " +
"vector for brain (In Continuous state space)." +
"Or number of possible values (in Discrete state space)."));
SerializedProperty bpNumStackedVectorObs = serializedBrain.FindProperty("brainParameters.numStackedVectorObservations");
EditorGUILayout.PropertyField(bpNumStackedVectorObs, new GUIContent("Stacked Vectors", "Number of states that" +
" will be stacked before beeing fed to the neural network."));
EditorGUI.indentLevel--;
SerializedProperty bpCamResol = serializedBrain.FindProperty("brainParameters.cameraResolutions");
EditorGUILayout.PropertyField(bpCamResol, new GUIContent("Visual Observation", "Describes height, " +
"width, and whether to greyscale visual observations for the Brain."), true);
EditorGUILayout.LabelField("Vector Action");
EditorGUI.indentLevel++;
SerializedProperty bpVectorActionType = serializedBrain.FindProperty("brainParameters.vectorActionSpaceType");
EditorGUILayout.PropertyField(bpVectorActionType, new GUIContent("Space Type", "Corresponds to whether state" +
" vector contains a single integer (Discrete) " +
"or a series of real-valued floats (Continuous)."));
SerializedProperty bpVectorActionSize = serializedBrain.FindProperty("brainParameters.vectorActionSize");
EditorGUILayout.PropertyField(bpVectorActionSize, new GUIContent("Space Size", "Length of action vector " +
"for brain (In Continuous state space)." +
"Or number of possible values (In Discrete action space)."));
SerializedProperty bpVectorActionDescription = serializedBrain.FindProperty("brainParameters.vectorActionDescriptions");
EditorGUILayout.PropertyField(bpVectorActionDescription, new GUIContent("Action Descriptions", "A list of strings used to name" +
" the available actions for the Brain."), true);
}
EditorGUI.indentLevel = indentLevel;
if (bt.enumValueIndex < 0) {
bt.enumValueIndex = (int)BrainType.Player;

unity-environment/Assets/ML-Agents/Examples/3DBall/3DHardScene.unity (272)