浏览代码

Reorganize learn.py (#302)

Split learn.py into learn.py as command-line wrapper, and trainer_controller.py as core trainer/env logic.
/develop-generalizationTraining-TrainerController
GitHub 7 年前
当前提交
e676017b
共有 8 个文件被更改,包括 425 次插入420 次删除
  1. 198
      python/learn.py
  2. 190
      python/test_unityagents.py
  3. 58
      python/trainers/imitation_trainer.py
  4. 34
      python/trainers/ppo_models.py
  5. 9
      python/trainers/ppo_trainer.py
  6. 108
      unity-environment/Assets/ML-Agents/Examples/GridWorld/GridWorld.unity
  7. 248
      python/trainer_controller.py
  8. 0
      /python/trainer_config.yaml

198
python/learn.py


# # Unity ML Agents
# ## ML-Agent Learning
# Launches trainers for each External Brain in a Unity Environment
import os
import re
import yaml
from trainer_controller import TrainerController
from datetime import datetime
from trainers.ghost_trainer import GhostTrainer
from trainers.ppo_models import *
from trainers.ppo_trainer import PPOTrainer
from trainers.imitation_trainer import ImitationTrainer
from unityagents import UnityEnvironment, UnityEnvironmentException
def get_progress():
    """
    Returns the value handed to the curriculum when deciding on a lesson change.

    Reads the module-level ``curriculum_file``, ``env`` and ``trainers``.

    :return: None when no curriculum file is set or the measure type is not
        recognised; for the "progress" measure, the mean over external brains
        of get_step / get_max_steps; for the "reward" measure, the sum over
        external brains of get_last_reward.
    """
    # Without a curriculum there is nothing to measure.
    if curriculum_file is None:
        return None

    measure = env.curriculum.measure_type
    if measure == "progress":
        total = 0
        for name in env.external_brain_names:
            brain_trainer = trainers[name]
            total += brain_trainer.get_step / brain_trainer.get_max_steps
        return total / len(env.external_brain_names)

    if measure == "reward":
        total = 0
        for name in env.external_brain_names:
            total += trainers[name].get_last_reward
        # The reward measure is a plain sum, not an average.
        return total

    return None
ppo (<env>) [options]
learn (<env>) [options]
Options:
--help Show this message.

--load Whether to load the model or randomly initialize [default: False].
--run-path=<path> The sub-directory name for model and summary statistics [default: ppo].
--run-id=<path> The sub-directory name for model and summary statistics [default: ppo].
--save-freq=<n> Frequency at which to save model [default: 50000].
--seed=<n> Random seed used for training [default: None].
--slow Whether to run the game at training speed [default: False].

logger.info(options)
# General parameters
model_path = './models/{}'.format(str(options['--run-path']))
run_id = options['--run-id']
seed = int(options['--seed'])
load_model = options['--load']
train_model = options['--train']

lesson = int(options['--lesson'])
fast_simulation = not bool(options['--slow'])
if seed is None:
seed = datetime.now()
np.random.seed(seed)
tf.set_random_seed(seed)
env = UnityEnvironment(file_name=env_name, worker_id=worker_id, curriculum=curriculum_file, seed=seed)
env.curriculum.set_lesson_number(lesson)
logger.info(str(env))
tf.reset_default_graph()
try:
if not os.path.exists(model_path):
os.makedirs(model_path)
except:
raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
" Please make sure the permissions are set correctly.".format(model_path))
try:
with open("trainer_configurations.yaml") as data_file:
trainer_configurations = yaml.load(data_file)
except IOError:
raise UnityEnvironmentException("The file {} could not be found. Will use default Hyperparameters"
.format("trainer_configurations.yaml"))
except UnicodeDecodeError:
raise UnityEnvironmentException("There was an error decoding {}".format("trainer_configurations.yaml"))
with tf.Session() as sess:
trainers = {}
trainer_parameters_dict = {}
for brain_name in env.external_brain_names:
trainer_parameters = trainer_configurations['default'].copy()
if len(env.external_brain_names) > 1:
graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
trainer_parameters['graph_scope'] = graph_scope
trainer_parameters['summary_path'] = './summaries/{}'.format(
str(options['--run-path'])) + '_' + graph_scope
else:
trainer_parameters['graph_scope'] = ''
trainer_parameters['summary_path'] = './summaries/{}'.format(str(options['--run-path']))
if brain_name in trainer_configurations:
_brain_key = brain_name
while not isinstance(trainer_configurations[_brain_key], dict):
_brain_key = trainer_configurations[_brain_key]
for k in trainer_configurations[_brain_key]:
trainer_parameters[k] = trainer_configurations[_brain_key][k]
trainer_parameters_dict[brain_name] = trainer_parameters.copy()
for brain_name in env.external_brain_names:
if 'is_ghost' not in trainer_parameters_dict[brain_name]:
trainer_parameters_dict[brain_name]['is_ghost'] = False
if 'is_imitation' not in trainer_parameters_dict[brain_name]:
trainer_parameters_dict[brain_name]['is_imitation'] = False
if trainer_parameters_dict[brain_name]['is_ghost']:
if trainer_parameters_dict[brain_name]['brain_to_copy'] not in env.external_brain_names:
raise UnityEnvironmentException("The external brain {0} could not be found in the environment "
"even though the ghost trainer of brain {1} is trying to ghost it."
.format(trainer_parameters_dict[brain_name]['brain_to_copy'],
brain_name))
trainer_parameters_dict[brain_name]['original_brain_parameters'] = trainer_parameters_dict[
trainer_parameters_dict[brain_name]['brain_to_copy']]
trainers[brain_name] = GhostTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name],
train_model, seed)
elif trainer_parameters_dict[brain_name]['is_imitation']:
trainers[brain_name] = ImitationTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name],
train_model, seed)
else:
trainers[brain_name] = PPOTrainer(sess, env, brain_name, trainer_parameters_dict[brain_name],
train_model, seed)
for k, t in trainers.items():
logger.info(t)
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=keep_checkpoints)
# Instantiate model parameters
if load_model:
logger.info('Loading Model...')
ckpt = tf.train.get_checkpoint_state(model_path)
if ckpt is None:
logger.info('The model {0} could not be found. Make sure you specified the right '
'--run-path'.format(model_path))
saver.restore(sess, ckpt.model_checkpoint_path)
else:
sess.run(init)
global_step = 0 # This is only for saving the model
env.curriculum.increment_lesson(get_progress())
info = env.reset(train_mode=fast_simulation)
if train_model:
for brain_name, trainer in trainers.items():
trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
try:
while any([t.get_step <= t.get_max_steps for k, t in trainers.items()]) or not train_model:
if env.global_done:
env.curriculum.increment_lesson(get_progress())
info = env.reset(train_mode=fast_simulation)
for brain_name, trainer in trainers.items():
trainer.end_episode()
# Decide and take an action
take_action_actions, take_action_memories, take_action_values, take_action_outputs = {}, {}, {}, {}
for brain_name, trainer in trainers.items():
(take_action_actions[brain_name],
take_action_memories[brain_name],
take_action_values[brain_name],
take_action_outputs[brain_name]) = trainer.take_action(info)
new_info = env.step(action=take_action_actions, memory=take_action_memories, value=take_action_values)
for brain_name, trainer in trainers.items():
trainer.add_experiences(info, new_info, take_action_outputs[brain_name])
info = new_info
for brain_name, trainer in trainers.items():
trainer.process_experiences(info)
if trainer.is_ready_update() and train_model and trainer.get_step <= trainer.get_max_steps:
# Perform gradient descent with experience buffer
trainer.update_model()
# Write training statistics to tensorboard.
trainer.write_summary(env.curriculum.lesson_number)
if train_model and trainer.get_step <= trainer.get_max_steps:
trainer.increment_step()
trainer.update_last_reward()
if train_model and trainer.get_step <= trainer.get_max_steps:
global_step += 1
if global_step % save_freq == 0 and global_step != 0 and train_model:
# Save Tensorflow model
save_model(sess, model_path=model_path, steps=global_step, saver=saver)
# Final save Tensorflow model
if global_step != 0 and train_model:
save_model(sess, model_path=model_path, steps=global_step, saver=saver)
except KeyboardInterrupt:
if train_model:
logger.info("Learning was interrupted. Please wait while the graph is generated.")
save_model(sess, model_path=model_path, steps=global_step, saver=saver)
pass
env.close()
if train_model:
graph_name = (env_name.strip()
.replace('.app', '').replace('.exe', '').replace('.x86_64', '').replace('.x86', ''))
graph_name = os.path.basename(os.path.normpath(graph_name))
nodes = []
scopes = []
for brain_name in trainers.keys():
if trainers[brain_name].graph_scope is not None:
scope = trainers[brain_name].graph_scope + '/'
if scope == '/':
scope = ''
scopes += [scope]
if trainers[brain_name].parameters["is_imitation"]:
nodes += [scope + x for x in ["action"]]
elif not trainers[brain_name].parameters["use_recurrent"]:
nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
else:
nodes += [scope + x for x in ["action", "value_estimate", "action_probs", "recurrent_out"]]
export_graph(model_path, graph_name, target_nodes=','.join(nodes))
if len(scopes) > 1:
logger.info("List of available scopes :")
for scope in scopes:
logger.info("\t" + scope)
logger.info("List of nodes exported :")
for n in nodes:
logger.info("\t" + n)
tc = TrainerController(env_name, run_id, save_freq, curriculum_file, fast_simulation, load_model, train_model,
worker_id, keep_checkpoints, lesson, seed)
tc.start_learning()

190
python/test_unityagents.py


import json
import numpy as np
import os
import socket
import mock
import json
import tensorflow as tf
from trainers.buffer import Buffer
from trainers.ppo_models import *
BrainInfo, BrainParameters, Curriculum
BrainInfo, Curriculum
from trainers.ppo_models import *
from trainers.buffer import Buffer
def append_length(input):
    """
    Prefixes a message with its encoded byte length.

    :param input: Text message to frame.
    :return: bytes made of the payload length packed with struct format "I",
        followed by the encoded payload.
    """
    # Encode once and reuse the bytes for both the length and the payload.
    payload = input.encode()
    return struct.pack("I", len(payload)) + payload

}'''.encode()
dummy_reset = [
'CONFIG_REQUEST'.encode(),
append_length(
'''
{
"brain_name": "RealFakeBrain",
"agents": [1,2],
"states": [1,2,3,4,5,6],
"rewards": [1,2],
"actions": [1,2,3,4],
"memories": [],
"dones": [false, false]
}'''),
'False'.encode()]
'CONFIG_REQUEST'.encode(),
append_length(
'''
{
"brain_name": "RealFakeBrain",
"agents": [1,2],
"states": [1,2,3,4,5,6],
"rewards": [1,2],
"actions": [1,2,3,4],
"memories": [],
"dones": [false, false]
}'''),
'False'.encode()]
append_length('''
append_length('''
{
"brain_name": "RealFakeBrain",
"agents": [1,2,3],

"memories": [],
"dones": [false, false, false]
}'''),
'False'.encode(),
'actions'.encode(),
append_length('''
'False'.encode(),
'actions'.encode(),
append_length('''
{
"brain_name": "RealFakeBrain",
"agents": [1,2,3],

"memories": [],
"dones": [false, false, true]
}'''),
'True'.encode()]
'True'.encode()]
def test_handles_bad_filename():
with pytest.raises(UnityEnvironmentException):

env.step([0])
assert env.brain_names[0] == 'RealFakeBrain'
env.close()
def test_reset():
with mock.patch('subprocess.Popen') as mock_subproc_popen:

mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
brain = env.brains['RealFakeBrain']
mock_socket.recv.side_effect = dummy_reset
mock_socket.recv.side_effect = dummy_reset
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size
assert brain_info['RealFakeBrain'].states.shape[1] == brain.state_space_size
def test_step():

mock_socket.recv.return_value.decode.return_value = dummy_start
env = UnityEnvironment(' ')
brain = env.brains['RealFakeBrain']
mock_socket.recv.side_effect = dummy_reset
mock_socket.recv.side_effect = dummy_reset
brain_info = env.reset()
mock_socket.recv.side_effect = dummy_step
brain_info = env.step([0] * brain.action_space_size * len(brain_info['RealFakeBrain'].agents))

assert env.global_done
assert isinstance(brain_info, dict)
assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].observations, list)
assert isinstance(brain_info['RealFakeBrain'].states, np.ndarray)
assert len(brain_info['RealFakeBrain'].observations) == brain.number_observations
assert brain_info['RealFakeBrain'].states.shape[0] == len(brain_info['RealFakeBrain'].agents)

def test_close():
with mock.patch('subprocess.Popen') as mock_subproc_popen:
with mock.patch('socket.socket') as mock_socket:

env.close()
assert not env._loaded
mock_socket.close.assert_called_once()
dummy_curriculum= json.loads('''{
dummy_curriculum = json.loads('''{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,

"param3" : [0.2, 0.3, 0.7, 0.9]
}
}''')
bad_curriculum= json.loads('''{
bad_curriculum = json.loads('''{
"measure" : "reward",
"thresholds" : [10, 20, 50],
"min_lesson_length" : 3,

}''')
with mock.patch(open_name, create=True) as mock_open:
mock_open.return_value = 0
mock_load.return_value = bad_curriculum
with pytest.raises(UnityEnvironmentException):
curriculum = Curriculum('test_unityagents.py', {"param1":1,"param2":1,"param3":1})
mock_load.return_value = dummy_curriculum
with pytest.raises(UnityEnvironmentException):
curriculum = Curriculum('test_unityagents.py', {"param1":1,"param2":1})
curriculum = Curriculum('test_unityagents.py', {"param1":1,"param2":1,"param3":1})
assert curriculum.get_lesson_number == 0
curriculum.set_lesson_number(1)
assert curriculum.get_lesson_number == 1
curriculum.increment_lesson(10)
assert curriculum.get_lesson_number == 1
curriculum.increment_lesson(30)
curriculum.increment_lesson(30)
assert curriculum.get_lesson_number == 1
assert curriculum.lesson_length == 3
curriculum.increment_lesson(30)
assert curriculum.get_config() == {'param1': 0.3, 'param2': 20, 'param3': 0.7}
assert curriculum.get_config(0) == {"param1":0.7,"param2":100,"param3":0.2}
assert curriculum.lesson_length == 0
assert curriculum.get_lesson_number == 2
with mock.patch(open_name, create=True) as mock_open:
mock_open.return_value = 0
mock_load.return_value = bad_curriculum
with pytest.raises(UnityEnvironmentException):
curriculum = Curriculum('test_unityagents.py', {"param1": 1, "param2": 1, "param3": 1})
mock_load.return_value = dummy_curriculum
with pytest.raises(UnityEnvironmentException):
curriculum = Curriculum('test_unityagents.py', {"param1": 1, "param2": 1})
curriculum = Curriculum('test_unityagents.py', {"param1": 1, "param2": 1, "param3": 1})
assert curriculum.get_lesson_number == 0
curriculum.set_lesson_number(1)
assert curriculum.get_lesson_number == 1
curriculum.increment_lesson(10)
assert curriculum.get_lesson_number == 1
curriculum.increment_lesson(30)
curriculum.increment_lesson(30)
assert curriculum.get_lesson_number == 1
assert curriculum.lesson_length == 3
curriculum.increment_lesson(30)
assert curriculum.get_config() == {'param1': 0.3, 'param2': 20, 'param3': 0.7}
assert curriculum.get_config(0) == {"param1": 0.7, "param2": 100, "param3": 0.2}
assert curriculum.lesson_length == 0
assert curriculum.get_lesson_number == 2
c_action_c_state_start = '''{

"stateSpaceType": 1
}]
}'''.encode()
def test_ppo_model_continuous():
tf.reset_default_graph()

# End of mock
# End of mock
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
mock_glob.return_value = ['FakeLaunchPath']

sess.run(init)
run_list = [model.output, model.probs, model.value, model.entropy,
model.learning_rate]
model.learning_rate]
model.sequence_length: 1,
model.state_in : np.array([[1,2,3],[3,4,5]]),
model.epsilon :np.random.randn(2, 2)
}
sess.run(run_list, feed_dict = feed_dict)
model.sequence_length: 1,
model.state_in: np.array([[1, 2, 3], [3, 4, 5]]),
model.epsilon: np.random.randn(2, 2)
}
sess.run(run_list, feed_dict=feed_dict)
d_action_c_state_start = '''{
"AcademyName": "RealFakeAcademy",

}]
}'''.encode()
# End of mock
# End of mock
with tf.Session() as sess:
with tf.variable_scope("FakeGraphScope"):
mock_glob.return_value = ['FakeLaunchPath']

sess.run(init)
run_list = [model.output, model.probs, model.value, model.entropy,
model.learning_rate]
model.learning_rate]
model.sequence_length: 1,
model.state_in : np.array([[1,2,3],[3,4,5]]),
model.observation_in[0] : np.ones([2,40,30,3])
}
sess.run(run_list, feed_dict = feed_dict)
model.sequence_length: 1,
model.state_in: np.array([[1, 2, 3], [3, 4, 5]]),
model.observation_in[0]: np.ones([2, 40, 30, 3])
}
sess.run(run_list, feed_dict=feed_dict)
def assert_array(a, b):
assert a.shape == b.shape

for fake_agent_id in range(4):
for i in range(9):
b[fake_agent_id]['state'].append(
[100*fake_agent_id+10*i +1, 100*fake_agent_id+10*i +2, 100*fake_agent_id+10*i +3]
)
b[fake_agent_id]['action'].append([100*fake_agent_id+10*i +4,100*fake_agent_id+10*i +5])
a = b[1]['state'].get_batch(batch_size = 2, training_length = None, sequential = True)
assert_array(a, np.array([[171,172,173], [181,182,183]]))
a = b[2]['state'].get_batch(batch_size = 2, training_length = 3, sequential = True)
[100 * fake_agent_id + 10 * i + 1, 100 * fake_agent_id + 10 * i + 2, 100 * fake_agent_id + 10 * i + 3]
)
b[fake_agent_id]['action'].append([100 * fake_agent_id + 10 * i + 4, 100 * fake_agent_id + 10 * i + 5])
a = b[1]['state'].get_batch(batch_size=2, training_length=None, sequential=True)
assert_array(a, np.array([[171, 172, 173], [181, 182, 183]]))
a = b[2]['state'].get_batch(batch_size=2, training_length=3, sequential=True)
[[231,232,233], [241,242,243], [251,252,253]],
[[261,262,263], [271,272,273], [281,282,283]]
]))
a = b[2]['state'].get_batch(batch_size = 2, training_length = 3, sequential = False)
[[231, 232, 233], [241, 242, 243], [251, 252, 253]],
[[261, 262, 263], [271, 272, 273], [281, 282, 283]]
]))
a = b[2]['state'].get_batch(batch_size=2, training_length=3, sequential=False)
[[251,252,253], [261,262,263], [271,272,273]],
[[261,262,263], [271,272,273], [281,282,283]]
]))
[[251, 252, 253], [261, 262, 263], [271, 272, 273]],
[[261, 262, 263], [271, 272, 273], [281, 282, 283]]
]))
b.append_update_buffer(3,
batch_size = None, training_length=2)
b.append_update_buffer(2,
batch_size = None, training_length=2)
b.append_update_buffer(3,
batch_size=None, training_length=2)
b.append_update_buffer(2,
batch_size=None, training_length=2)
assert np.array(b.update_buffer['action']).shape == (10,2,2)
assert np.array(b.update_buffer['action']).shape == (10, 2, 2)
if __name__ == '__main__':

58
python/trainers/imitation_trainer.py


import tensorflow as tf
from trainers.buffer import Buffer
from trainers.ppo_models import *
from trainers.trainer import UnityTrainerException, Trainer
logger = logging.getLogger("unityagents")

class ImitationTrainer(Trainer):
"""The ImitationTrainer is an implementation of the imitation learning."""
def __init__(self, sess, env, brain_name, trainer_parameters, training, seed):
"""
Responsible for collecting experiences and training PPO model.

:param training: Whether the trainer is set for training.
"""
self.param_keys = [ 'is_imitation', 'brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope',
'summary_freq', 'max_steps', 'batches_per_epoch']
self.param_keys = ['is_imitation', 'brain_to_imitate', 'batch_size', 'time_horizon', 'graph_scope',
'summary_freq', 'max_steps', 'batches_per_epoch']
"brain {1}.".format(k, brain_name))
"brain {1}.".format(k, brain_name))
super(ImitationTrainer, self).__init__(sess, env, brain_name, trainer_parameters, training)

self.step = 0
self.cumulative_rewards = {}
self.episode_steps = {}
self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward' : []}
self.stats = {'losses': [], 'episode_length': [], 'cumulative_reward': []}
logger.log('Cannot use observations with imitation learning')
logger.info('Cannot use observations with imitation learning')
self.use_states = (env.brains[brain_name].state_space_size > 0)
self.summary_path = trainer_parameters['summary_path']
if not os.path.exists(self.summary_path):

a_size = self.brain.action_space_size
with tf.variable_scope(self.variable_scope):
tf.set_random_seed(seed)
self.network = ImitationNN(state_size = s_size,
action_size = a_size,
h_size = int(trainer_parameters['hidden_units']),
lr = float(trainer_parameters['learning_rate']),
action_type = self.brain.action_space_type,
n_layers=int(trainer_parameters['num_layers']))
self.network = ImitationNN(state_size=s_size,
action_size=a_size,
h_size=int(trainer_parameters['hidden_units']),
lr=float(trainer_parameters['learning_rate']),
action_type=self.brain.action_space_type,
n_layers=int(trainer_parameters['num_layers']))
def __str__(self):

"""
return
def take_action(self, info):
"""
Decides actions given state/observation information, and takes them in environment.

E = info[self.brain_name]
agent_action = self.sess.run(self.network.sample_action, feed_dict={self.network.state: E.states})
return (agent_action, None, None, None)
return agent_action, None, None, None
def add_experiences(self, info, next_info, take_action_outputs):
"""

info_P = info[self.brain_to_imitate]
for l in range(len(info_P.agents)):
if ((info_P.local_done[l] or
len(self.training_buffer[info_P.agents[l]]['actions']) > self.trainer_parameters['time_horizon'])
if ((info_P.local_done[l] or
len(self.training_buffer[info_P.agents[l]]['actions']) > self.trainer_parameters['time_horizon'])
self.training_buffer.append_update_buffer(agent_id,
batch_size = None, training_length=None)
self.training_buffer.append_update_buffer(agent_id, batch_size=None, training_length=None)
self.training_buffer[agent_id].reset_agent()
info_E = info[self.brain_name]

self.cumulative_rewards[agent_id] = 0
self.episode_steps[agent_id] = 0
def end_episode(self):
"""
A signal that the Episode has ended. The buffer must be reset.

"""
batch_size = self.trainer_parameters['batch_size']
for j in range(min(len(self.training_buffer.update_buffer['actions']) // self.batch_size, self.batches_per_epoch)):
for j in range(
min(len(self.training_buffer.update_buffer['actions']) // self.batch_size, self.batches_per_epoch)):
self.network.state: batch_states.reshape([-1, 1]),
self.network.state: batch_states.reshape([-1, 1]),
self.network.state: batch_states.reshape([self.batch_size, -1]),
self.network.state: batch_states.reshape([self.batch_size, -1]),
self.network.true_action: batch_actions.reshape([self.batch_size, -1])
}
loss, _ = self.sess.run([self.network.loss, self.network.update], feed_dict=feed_dict)

else:
self.stats['losses'].append(0)
def write_summary(self, lesson_number):
"""
Saves training statistics to Tensorboard.

self.is_training and self.get_step <= self.get_max_steps):
self.is_training and self.get_step <= self.get_max_steps):
.format(self.brain_name, steps, mean_reward, np.std(self.stats['cumulative_reward'])))
.format(self.brain_name, steps, mean_reward, np.std(self.stats['cumulative_reward'])))
summary = tf.Summary()
for key in self.stats:
if len(self.stats[key]) > 0:

summary.value.add(tag='Info/Lesson', simple_value=lesson_number)
self.summary_writer.add_summary(summary, steps)
self.summary_writer.flush()

34
python/trainers/ppo_models.py


import tensorflow as tf
import tensorflow.contrib.layers as c_layers
from tensorflow.python.tools import freeze_graph
from unityagents import UnityEnvironmentException
logger = logging.getLogger("unityagents")

if brain.action_space_type == "discrete":
return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, use_recurrent, num_layers,
m_size)
def save_model(sess, saver, model_path="./", steps=0):
    """
    Writes a checkpoint and the raw graph definition into the model folder.

    :param sess: Current Tensorflow session.
    :param saver: Tensorflow saver for session.
    :param model_path: Designated model path.
    :param steps: Current number of steps in training process; used in the
        checkpoint file name.
    """
    checkpoint_file = model_path + '/model-{}.cptk'.format(steps)
    saver.save(sess, checkpoint_file)
    # The binary graph definition is written next to the checkpoint.
    tf.train.write_graph(sess.graph_def, model_path, 'raw_graph_def.pb', as_text=False)
    logger.info("Saved Model")
def export_graph(model_path, env_name="env", target_nodes="action,value_estimate,action_probs"):
    """
    Freezes the latest saved model into a .bytes file for Unity embedding.

    :param model_path: path of model checkpoints.
    :param env_name: Name of associated Learning Environment; used as the
        output file name.
    :param target_nodes: Comma separated string of needed output nodes for embedded graph.
    """
    raw_graph_file = model_path + '/raw_graph_def.pb'
    frozen_graph_file = model_path + '/' + env_name + '.bytes'
    checkpoint_state = tf.train.get_checkpoint_state(model_path)
    freeze_graph.freeze_graph(
        input_graph=raw_graph_file,
        input_binary=True,
        input_checkpoint=checkpoint_state.model_checkpoint_path,
        output_node_names=target_nodes,
        output_graph=frozen_graph_file,
        clear_devices=True,
        initializer_nodes="",
        input_saver="",
        restore_op_name="save/restore_all",
        filename_tensor_name="save/Const:0")
class PPOModel(object):

9
python/trainers/ppo_trainer.py


import tensorflow as tf
from trainers.buffer import Buffer
from trainers.ppo_models import *
from trainers.ppo_models import create_agent_model
from trainers.trainer import UnityTrainerException, Trainer
logger = logging.getLogger("unityagents")

if len(self.stats['cumulative_reward']) > 0:
mean_reward = np.mean(self.stats['cumulative_reward'])
self.sess.run(self.model.update_reward, feed_dict={self.model.new_reward: mean_reward})
last_reward = self.sess.run(self.model.last_reward)
def running_average(self, data, steps, running_mean, running_variance):
"""

for l in range(len(info.agents)):
agent_actions = self.training_buffer[info.agents[l]]['actions']
if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon'])
and len(agent_actions) > 0):
and len(agent_actions) > 0):
if info.local_done[l] and not info.max_reached[l]:
value_next = 0.0
else:

gamma=self.trainer_parameters['gamma'],
lambd=self.trainer_parameters['lambd'])
)
self.training_buffer[agent_id]['discounted_returns'].set( \
self.training_buffer[agent_id]['advantages'].get_batch() \
self.training_buffer[agent_id]['discounted_returns'].set(
self.training_buffer[agent_id]['advantages'].get_batch()
+ self.training_buffer[agent_id]['value_estimates'].get_batch())
self.training_buffer.append_update_buffer(agent_id,

108
unity-environment/Assets/ML-Agents/Examples/GridWorld/GridWorld.unity


m_Father: {fileID: 0}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 45, y: 45, z: 0}
--- !u!114 &299967728
MonoBehaviour:
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_GameObject: {fileID: 0}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 943466ab374444748a364f9d6c3e2fe2, type: 3}
m_Name: (Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)
m_EditorClassIdentifier:
broadcast: 1
brain: {fileID: 1535917239}
--- !u!1 &363761396
GameObject:
m_ObjectHideFlags: 0

m_AnchoredPosition: {x: 0, y: 0}
m_SizeDelta: {x: 0, y: 0}
m_Pivot: {x: 0, y: 0}
--- !u!114 &467853281
MonoBehaviour:
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_GameObject: {fileID: 0}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 943466ab374444748a364f9d6c3e2fe2, type: 3}
m_Name: (Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)
m_EditorClassIdentifier:
broadcast: 1
brain: {fileID: 1535917239}
--- !u!1 &486401523
GameObject:
m_ObjectHideFlags: 0

m_OcclusionCulling: 1
m_StereoConvergence: 10
m_StereoSeparation: 0.022
--- !u!114 &551668186
MonoBehaviour:
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_GameObject: {fileID: 0}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 35813a1be64e144f887d7d5f15b963fa, type: 3}
m_Name: (Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)
m_EditorClassIdentifier:
brain: {fileID: 1535917239}
--- !u!114 &633896473
MonoBehaviour:
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_GameObject: {fileID: 0}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 41e9bda8f3cf1492fa74926a530f6f70, type: 3}
m_Name: (Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)
m_EditorClassIdentifier:
broadcast: 1
continuousPlayerActions: []
discretePlayerActions:
- key: 273
value: 0
- key: 274
value: 1
- key: 276
value: 2
- key: 275
value: 3
defaultAction: -1
brain: {fileID: 1535917239}
--- !u!1 &742849316
GameObject:
m_ObjectHideFlags: 0

m_Father: {fileID: 0}
m_RootOrder: 3
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!114 &780827900
MonoBehaviour:
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_GameObject: {fileID: 0}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 41e9bda8f3cf1492fa74926a530f6f70, type: 3}
m_Name: (Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)
m_EditorClassIdentifier:
broadcast: 1
continuousPlayerActions: []
discretePlayerActions:
- key: 273
value: 0
- key: 274
value: 1
- key: 276
value: 2
- key: 275
value: 3
defaultAction: -1
brain: {fileID: 1535917239}
--- !u!1 &959566328
GameObject:
m_ObjectHideFlags: 0

- Right
actionSpaceType: 0
stateSpaceType: 1
brainType: 3
brainType: 2
- {fileID: 780827900}
- {fileID: 299967728}
- {fileID: 2102493396}
- {fileID: 633896473}
- {fileID: 467853281}
- {fileID: 551668186}
instanceID: 12840
instanceID: 14224
--- !u!1 &1553342942
GameObject:
m_ObjectHideFlags: 0

memory: []
id: 0
academy: {fileID: 2047663}
--- !u!114 &2102493396
MonoBehaviour:
m_ObjectHideFlags: 0
m_PrefabParentObject: {fileID: 0}
m_PrefabInternal: {fileID: 0}
m_GameObject: {fileID: 0}
m_Enabled: 1
m_EditorHideFlags: 0
m_Script: {fileID: 11500000, guid: 35813a1be64e144f887d7d5f15b963fa, type: 3}
m_Name: (Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)(Clone)
m_EditorClassIdentifier:
brain: {fileID: 1535917239}

248
python/trainer_controller.py


# # Unity ML Agents
# ## ML-Agent Learning
# Launches trainers for each External Brain in a Unity Environment
import logging
import numpy as np
import os
import re
import tensorflow as tf
import yaml
from datetime import datetime
from tensorflow.python.tools import freeze_graph
from trainers.ghost_trainer import GhostTrainer
from trainers.ppo_trainer import PPOTrainer
from trainers.imitation_trainer import ImitationTrainer
from unityagents import UnityEnvironment, UnityEnvironmentException
class TrainerController(object):
def __init__(self, env_name, run_id, save_freq, curriculum_file, fast_simulation, load, train,
             worker_id, keep_checkpoints, lesson, seed):
    """
    Stores the run configuration, seeds numpy and Tensorflow, and opens the environment.

    :param env_name: Name or path of the environment; passed to UnityEnvironment as file_name.
    :param run_id: Identifier of the run; models are placed under ./models/<run_id>.
    :param save_freq: Stored as self.save_freq.
    :param curriculum_file: Curriculum passed to UnityEnvironment (may be None).
    :param fast_simulation: Stored as self.fast_simulation.
    :param load: Stored as self.load_model.
    :param train: Stored as self.train_model.
    :param worker_id: Worker id passed to UnityEnvironment.
    :param keep_checkpoints: Stored as self.keep_checkpoints.
    :param lesson: Stored as self.lesson.
    :param seed: Integer random seed; when None a random integer seed is drawn.
    """
    self.model_path = './models/{}'.format(run_id)
    self.logger = logging.getLogger("unityagents")
    self.run_id = run_id
    self.save_freq = save_freq
    self.curriculum_file = curriculum_file
    self.lesson = lesson
    self.fast_simulation = fast_simulation
    self.load_model = load
    self.train_model = train
    self.worker_id = worker_id
    self.keep_checkpoints = keep_checkpoints
    self.trainers = {}
    if seed is None:
        # np.random.seed and tf.set_random_seed need an integer; the former
        # datetime.now() fallback handed them a datetime object instead.
        seed = np.random.randint(0, 999999)
    self.seed = seed
    np.random.seed(self.seed)
    tf.set_random_seed(self.seed)
    self.env = UnityEnvironment(file_name=env_name, worker_id=self.worker_id,
                                curriculum=self.curriculum_file, seed=self.seed)
    # Keep only the bare executable name (no platform extension, no directories).
    self.env_name = (env_name.strip().replace('.app', '').replace('.exe', '').replace('.x86_64', '')
                     .replace('.x86', ''))
    self.env_name = os.path.basename(os.path.normpath(self.env_name))
def _get_progress(self):
if self.curriculum_file is not None:
if self.env.curriculum.measure_type == "progress":
progress = 0
for brain_name in self.env.external_brain_names:
progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps
return progress / len(self.env.external_brain_names)
elif self.env.curriculum.measure_type == "reward":
progress = 0
for brain_name in self.env.external_brain_names:
progress += self.trainers[brain_name].get_last_reward
return progress
else:
return None
else:
return None
def _process_graph(self):
nodes = []
scopes = []
for brain_name in self.trainers.keys():
if self.trainers[brain_name].graph_scope is not None:
scope = self.trainers[brain_name].graph_scope + '/'
if scope == '/':
scope = ''
scopes += [scope]
if self.trainers[brain_name].parameters["is_imitation"]:
nodes += [scope + x for x in ["action"]]
elif not self.trainers[brain_name].parameters["use_recurrent"]:
nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
else:
nodes += [scope + x for x in ["action", "value_estimate", "action_probs", "recurrent_out"]]
if len(scopes) > 1:
self.logger.info("List of available scopes :")
for scope in scopes:
self.logger.info("\t" + scope)
self.logger.info("List of nodes to export :")
for n in nodes:
self.logger.info("\t" + n)
return nodes
def _save_model(self, sess, saver, model_path="./", steps=0):
"""
Saves current model to checkpoint folder.
:param sess: Current Tensorflow session.
:param model_path: Designated model path.
:param steps: Current number of steps in training process.
:param saver: Tensorflow saver for session.
"""
last_checkpoint = model_path + '/model-' + str(steps) + '.cptk'
saver.save(sess, last_checkpoint)
tf.train.write_graph(sess.graph_def, model_path, 'raw_graph_def.pb', as_text=False)
self.logger.info("Saved Model")
def _export_graph(self):
"""
Exports latest saved model to .bytes format for Unity embedding.
"""
target_nodes = ','.join(self._process_graph())
ckpt = tf.train.get_checkpoint_state(self.model_path)
freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb',
input_binary=True,
input_checkpoint=ckpt.model_checkpoint_path,
output_node_names=target_nodes,
output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes',
clear_devices=True, initializer_nodes="", input_saver="",
restore_op_name="save/restore_all", filename_tensor_name="save/Const:0")
def _initialize_trainers(self, trainer_config, sess):
trainer_parameters_dict = {}
for brain_name in self.env.external_brain_names:
trainer_parameters = trainer_config['default'].copy()
if len(self.env.external_brain_names) > 1:
graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
trainer_parameters['graph_scope'] = graph_scope
trainer_parameters['summary_path'] = './summaries/{}'.format(
str(self.run_id)) + '_' + graph_scope
else:
trainer_parameters['graph_scope'] = ''
trainer_parameters['summary_path'] = './summaries/{}'.format(self.run_id)
if brain_name in trainer_config:
_brain_key = brain_name
while not isinstance(trainer_config[_brain_key], dict):
_brain_key = trainer_config[_brain_key]
for k in trainer_config[_brain_key]:
trainer_parameters[k] = trainer_config[_brain_key][k]
trainer_parameters_dict[brain_name] = trainer_parameters.copy()
for brain_name in self.env.external_brain_names:
if 'is_ghost' not in trainer_parameters_dict[brain_name]:
trainer_parameters_dict[brain_name]['is_ghost'] = False
if 'is_imitation' not in trainer_parameters_dict[brain_name]:
trainer_parameters_dict[brain_name]['is_imitation'] = False
if trainer_parameters_dict[brain_name]['is_ghost']:
if trainer_parameters_dict[brain_name]['brain_to_copy'] not in self.env.external_brain_names:
raise UnityEnvironmentException(
"The external brain {0} could not be found in the environment "
"even though the ghost trainer of brain {1} is trying to ghost it."
.format(trainer_parameters_dict[brain_name]['brain_to_copy'],
brain_name))
trainer_parameters_dict[brain_name]['original_brain_parameters'] = trainer_parameters_dict[
trainer_parameters_dict[brain_name]['brain_to_copy']]
self.trainers[brain_name] = GhostTrainer(sess, self.env, brain_name,
trainer_parameters_dict[brain_name],
self.train_model, self.seed)
elif trainer_parameters_dict[brain_name]['is_imitation']:
self.trainers[brain_name] = ImitationTrainer(sess, self.env, brain_name,
trainer_parameters_dict[brain_name],
self.train_model, self.seed)
else:
self.trainers[brain_name] = PPOTrainer(sess, self.env, brain_name, trainer_parameters_dict[brain_name],
self.train_model, self.seed)
def start_learning(self):
self.env.curriculum.set_lesson_number(self.lesson)
self.logger.info(str(self.env))
tf.reset_default_graph()
try:
with open("trainer_config.yaml") as data_file:
trainer_config = yaml.load(data_file)
except IOError:
raise UnityEnvironmentException("The file {} could not be found. Will use default Hyperparameters"
.format("trainer_config.yaml"))
except UnicodeDecodeError:
raise UnityEnvironmentException("There was an error decoding {}".format("trainer_config.yaml"))
try:
if not os.path.exists(self.model_path):
os.makedirs(self.model_path)
except Exception:
raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
" Please make sure the permissions are set correctly."
.format(self.model_path))
with tf.Session() as sess:
self._initialize_trainers(trainer_config, sess)
for k, t in self.trainers.items():
self.logger.info(t)
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
# Instantiate model parameters
if self.load_model:
self.logger.info('Loading Model...')
ckpt = tf.train.get_checkpoint_state(self.model_path)
if ckpt is None:
self.logger.info('The model {0} could not be found. Make sure you specified the right '
'--run-id'.format(self.model_path))
saver.restore(sess, ckpt.model_checkpoint_path)
else:
sess.run(init)
global_step = 0 # This is only for saving the model
self.env.curriculum.increment_lesson(self._get_progress())
info = self.env.reset(train_mode=self.fast_simulation)
if self.train_model:
for brain_name, trainer in self.trainers.items():
trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
try:
while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) or not self.train_model:
if self.env.global_done:
self.env.curriculum.increment_lesson(self._get_progress())
info = self.env.reset(train_mode=self.fast_simulation)
for brain_name, trainer in self.trainers.items():
trainer.end_episode()
# Decide and take an action
take_action_actions, take_action_memories, take_action_values, take_action_outputs = {}, {}, {}, {}
for brain_name, trainer in self.trainers.items():
(take_action_actions[brain_name],
take_action_memories[brain_name],
take_action_values[brain_name],
take_action_outputs[brain_name]) = trainer.take_action(info)
new_info = self.env.step(action=take_action_actions, memory=take_action_memories,
value=take_action_values)
for brain_name, trainer in self.trainers.items():
trainer.add_experiences(info, new_info, take_action_outputs[brain_name])
info = new_info
for brain_name, trainer in self.trainers.items():
trainer.process_experiences(info)
if trainer.is_ready_update() and self.train_model and trainer.get_step <= trainer.get_max_steps:
# Perform gradient descent with experience buffer
trainer.update_model()
# Write training statistics to tensorboard.
trainer.write_summary(self.env.curriculum.lesson_number)
if self.train_model and trainer.get_step <= trainer.get_max_steps:
trainer.increment_step()
trainer.update_last_reward()
if self.train_model and trainer.get_step <= trainer.get_max_steps:
global_step += 1
if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
# Save Tensorflow model
self._save_model(sess, model_path=self.model_path, steps=global_step, saver=saver)
# Final save Tensorflow model
if global_step != 0 and self.train_model:
self._save_model(sess, model_path=self.model_path, steps=global_step, saver=saver)
except KeyboardInterrupt:
if self.train_model:
self.logger.info("Learning was interrupted. Please wait while the graph is generated.")
self._save_model(sess, model_path=self.model_path, steps=global_step, saver=saver)
pass
self.env.close()
if self.train_model:
self._export_graph()

/python/trainer_configurations.yaml → /python/trainer_config.yaml

正在加载...
取消
保存