Options:
  --help                    Show this message.
  --max-steps=<n>           Maximum number of steps to run the environment [default: 1e6].
  --curriculum              Whether to use a curriculum for training (requires a curriculum json file) [default: False].
  --curriculum-path=<path>  Path to the curriculum json file for the environment [default: curriculum.json].
  --train                   Whether to train the model, or only run inference [default: False].
  --summary-freq=<n>        Frequency at which to save training statistics [default: 10000].
  --save-freq=<n>           Frequency at which to save the model [default: 50000].
  --gamma=<n>               Reward discount rate [default: 0.99].
  --batch-size=<n>          How many experiences per gradient descent update step [default: 64].
  --hidden-units=<n>        Number of units in the hidden layers [default: 64].
  --keep-checkpoints=<n>    How many model checkpoints to keep [default: 5].
  --worker-id=<n>           Number to add to the communication port (5005); use when running multiple environments [default: 0].
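For reference, a curriculum training run could then be launched along these lines (the script and environment names below are placeholders; only the flags come from the options above):

  python ppo.py MyEnvironment --train --curriculum --curriculum-path=curriculum.json --max-steps=5e5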
# General parameters
max_steps = float(options['--max-steps'])
train_model = bool(options['--train'])
save_freq = int(options['--save-freq'])
env_name = options['<env>']
keep_checkpoints = int(options['--keep-checkpoints'])
worker_id = int(options['--worker-id'])
use_curriculum = options['--curriculum']
if use_curriculum:
    curriculum_path = str(options['--curriculum-path'])
else:
    curriculum_path = None
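The curriculum json itself is not part of this script. Purely as an illustration of the shape the ML-Agents curriculum loader expects, a minimal file might look like the sketch below; the reset-parameter names and threshold values are placeholders, and the "measure" field corresponds to the measure_type ("progress" or "reward") that get_progress() checks later in the script:

{
    "measure": "reward",
    "thresholds": [10, 20, 50],
    "min_lesson_length": 3,
    "signal_smoothing": true,
    "parameters": {
        "min_wall_height": [0, 4, 6, 8],
        "max_wall_height": [4, 7, 8, 8]
    }
}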
# Algorithm-specific parameters for tuning
gamma = float(options['--gamma'])
hidden_units = int(options['--hidden-units'])
batch_size = int(options['--batch-size'])

# Launch the Unity environment; curriculum_path is None when no curriculum is used
env = UnityEnvironment(file_name=env_name, worker_id=worker_id, curriculum=curriculum_path)
print(str(env))
brain_name = env.brain_names[0]

init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=keep_checkpoints)
def get_progress():
    # Progress signal reported to the curriculum on each reset: the fraction of
    # max_steps completed for a "progress" measure, or the most recent mean
    # reward for a "reward" measure.
    if use_curriculum:
        if env._curriculum.measure_type == "progress":
            return steps / max_steps
        elif env._curriculum.measure_type == "reward":
            return last_reward
        else:
            return None
    else:
        return None
with tf.Session() as sess:
    # Instantiate model parameters
    if load_model:
        # Restore the most recently saved checkpoint
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)
    steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])
    info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
    # trainer is the PPO Trainer built around ppo_model (constructed earlier in the full script)
    while steps <= max_steps or not train_model:
        if env.global_done:
            info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        # Decide and take an action
        new_info = trainer.take_action(info, env, brain_name)
        info = new_info
        if steps % save_freq == 0 and steps != 0 and train_model:
            # Save Tensorflow model
            save_model(sess, model_path=model_path, steps=steps, saver=saver)
        steps += 1
        sess.run(ppo_model.increment_step)
        if len(trainer.stats['cumulative_reward']) > 0:
            mean_reward = np.mean(trainer.stats['cumulative_reward'])
            print(mean_reward)
            sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
            last_reward = sess.run(ppo_model.last_reward)
    # Final save Tensorflow model
    if steps != 0 and train_model:
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
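Once the session block exits, the environment should be shut down cleanly; a minimal sketch of that final step is shown below (the graph-export step is only a comment here because it depends on a helper outside this excerpt):

env.close()
# Optionally freeze and export the trained graph for embedding in Unity, e.g. with
# an export_graph(model_path, env_name) helper as in the stock ML-Agents PPO script.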