
PPO additions and warnings

* Add linear decay to learning rate for PPO (a minimal sketch of the decay schedule follows the commit summary below)
* Add warning/exception for unsupported brain configurations with PPO
/develop-generalizationTraining-TrainerController
Arthur Juliani, 7 years ago
Current commit: 71591043
5 files changed, 38 insertions and 25 deletions
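The linear decay named in the first bullet is implemented in python/ppo/models.py with tf.train.polynomial_decay at power=1.0, driven by the new max_step argument. Below is a minimal standalone sketch of that schedule, assuming TensorFlow 1.x; the lr and max_step values are illustrative (they match the new function defaults, but in the actual code they come from ppo.py's options).

    import tensorflow as tf

    lr = 1e-4        # illustrative; matches the create_agent_model default
    max_step = 5e6   # illustrative; matches the new --max-steps default

    global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
    # power=1.0 makes the polynomial schedule a straight line from lr down to
    # (effectively) zero over max_step increments of global_step.
    learning_rate = tf.train.polynomial_decay(lr, global_step, max_step, 1e-10, power=1.0)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # Advancing global_step moves the schedule toward the end learning rate.
    increment_step = tf.assign(global_step, global_step + 1)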
1. python/PPO.ipynb (2 changes)
2. python/ppo.py (6 changes)
3. python/ppo/models.py (43 changes)
4. python/ppo/trainer.py (10 changes)
5. python/setup.py (2 changes)

python/PPO.ipynb (2 changes)


"# Create the Tensorflow model graph\n",
"ppo_model = create_agent_model(env, lr=learning_rate,\n",
" h_size=hidden_units, epsilon=epsilon,\n",
" beta=beta)\n",
" beta=beta, max_step=max_steps)\n",
"\n",
"is_continuous = (env.brains[brain_name].action_space_type == \"continuous\")\n",
"use_observations = (env.brains[brain_name].number_observations > 0)\n",

python/ppo.py (6 changes)


 Options:
   --help                Show this message.
-  --max-step=<n>        Maximum number of steps to run environment [default: 5e6].
+  --max-steps=<n>       Maximum number of steps to run environment [default: 5e6].
   --run-path=<path>     The sub-directory name for model and summary statistics [default: ppo].
   --load                Whether to load the model or randomly initialize [default: False].
   --train               Whether to train model, or only run inference [default: True].

 print(options)
 # General parameters
-max_steps = float(options['--max-step'])
+max_steps = float(options['--max-steps'])
 model_path = './models/{}'.format(str(options['--run-path']))
 summary_path = './summaries/{}'.format(str(options['--run-path']))
 load_model = options['--load']

 # Create the Tensorflow model graph
 ppo_model = create_agent_model(env, lr=learning_rate,
                                h_size=hidden_units, epsilon=epsilon,
-                               beta=beta)
+                               beta=beta, max_step=max_steps)
 is_continuous = (env.brains[brain_name].action_space_type == "continuous")
 use_observations = (env.brains[brain_name].number_observations > 0)

python/ppo/models.py (43 changes)


 import tensorflow as tf
 import tensorflow.contrib.layers as c_layers
 from tensorflow.python.tools import freeze_graph
+from unityagents import UnityEnvironmentException
-def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3):
+def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
     """
     Takes a Unity environment and model-specific hyperparameters and returns the
     appropriate PPO agent model for the environment.

     """
     brain_name = env.brain_names[0]
     if env.brains[brain_name].action_space_type == "continuous":
-        return ContinuousControlModel(lr, env.brains[brain_name].state_space_size,
-                                      env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+        if env.brains[brain_name].number_observations == 0:
+            return ContinuousControlModel(lr, env.brains[brain_name].state_space_size,
+                                          env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
+        else:
+            raise UnityEnvironmentException("There is currently no PPO model which supports both a continuous "
+                                            "action space and camera observations.")

-                                    env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+                                    env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)

+        if env.brains[brain_name].state_space_size > 0:
+            print("This brain contains agents with both observations and states. There is currently no PPO model"
+                  "which supports this. Defaulting to Vision-based PPO model.")
-        return VisualDiscreteControlModel(lr, h, w, env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+        return VisualDiscreteControlModel(lr, h, w, env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
 def save_model(sess, saver, model_path="./", steps=0):

     :param steps: Current number of steps in training process.
     :param saver: Tensorflow saver for session.
     """
-    last_checkpoint = model_path+'/model-'+str(steps)+'.cptk'
+    last_checkpoint = model_path + '/model-' + str(steps) + '.cptk'
     saver.save(sess, last_checkpoint)
     tf.train.write_graph(sess.graph_def, model_path, 'raw_graph_def.pb', as_text=False)
     print("Saved Model")

 class PPOModel(object):
-    def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr):
+    def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr, max_step):
         """
         Creates training-specific Tensorflow ops for PPO models.
         :param probs: Current policy probabilities

         self.loss = self.policy_loss + self.value_loss - beta * tf.reduce_mean(entropy)
-        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
+        self.global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
+        self.learning_rate = tf.train.polynomial_decay(lr, self.global_step,
+                                                       max_step, 1e-10,
+                                                       power=1.0)
+        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
-        self.global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
-        self.increment_step = tf.assign(self.global_step, self.global_step+1)
+        self.increment_step = tf.assign(self.global_step, self.global_step + 1)
-    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta, max_step):
         """
         Creates Continuous Control Actor-Critic model.
         :param s_size: State-space size

         self.old_probs = tf.placeholder(shape=[None, a_size], dtype=tf.float32, name='old_probabilities')
-        PPOModel.__init__(self, self.probs, self.old_probs, self.value, self.entropy, 0.0, epsilon, lr)
+        PPOModel.__init__(self, self.probs, self.old_probs, self.value, self.entropy, 0.0, epsilon, lr, max_step)
-    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta, max_step):
         """
         Creates Discrete Control Actor-Critic model.
         :param s_size: State-space size

         self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)
         PPOModel.__init__(self, self.responsible_probs, self.old_responsible_probs,
-                          self.value, self.entropy, beta, epsilon, lr)
+                          self.value, self.entropy, beta, epsilon, lr, max_step)
-    def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta, max_step):
         """
         Creates Discrete Control Actor-Critic model for use with visual observations (images).
         :param o_size_h: Observation height.

         self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)
         PPOModel.__init__(self, self.responsible_probs, self.old_responsible_probs,
-                          self.value, self.entropy, beta, epsilon, lr)
+                          self.value, self.entropy, beta, epsilon, lr, max_step)
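As a usage note, here is a hedged sketch of how the new guard in create_agent_model surfaces to a caller. Only create_agent_model, its parameters, and UnityEnvironmentException come from this commit; the environment build name and the idea of catching the exception at the call site are illustrative assumptions.

    from unityagents import UnityEnvironment, UnityEnvironmentException
    from ppo.models import create_agent_model

    env = UnityEnvironment(file_name="my_environment")  # hypothetical build name
    try:
        # Raises UnityEnvironmentException if the brain pairs a continuous
        # action space with camera observations, per the new check above.
        ppo_model = create_agent_model(env, lr=1e-4, h_size=128,
                                       epsilon=0.2, beta=1e-3, max_step=5e6)
    except UnityEnvironmentException as e:
        print("Unsupported brain configuration:", e)
        env.close()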

python/ppo/trainer.py (10 changes)


         self.model = ppo_model
         self.sess = sess
         stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
-                 'entropy': [], 'value_loss': [], 'policy_loss': []}
+                 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
         self.stats = stats
         self.training_buffer = vectorize_history(empty_local_history({}))
self.training_buffer = vectorize_history(empty_local_history({}))

                          self.model.batch_size: len(info.states)}
         else:
             feed_dict = {self.model.state_in: info.states, self.model.batch_size: len(info.states)}
-        actions, a_dist, value, ent = self.sess.run([self.model.output, self.model.probs,
-                                                     self.model.value, self.model.entropy],
-                                                    feed_dict=feed_dict)
+        actions, a_dist, value, ent, learn_rate = self.sess.run([self.model.output, self.model.probs,
+                                                                 self.model.value, self.model.entropy,
+                                                                 self.model.learning_rate],
+                                                                feed_dict=feed_dict)
+        self.stats['learning_rate'].append(learn_rate)
         new_info = env.step(actions, value={brain_name: value})[brain_name]
         self.add_experiences(info, new_info, epsi, actions, a_dist, value)
         return new_info
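Because the step loop now fetches self.model.learning_rate and appends it to stats['learning_rate'], the decayed rate can be reported alongside the other statistics. Below is a sketch of one way a caller might average and log it, assuming TensorFlow 1.x; the writer path, tag name, and steps variable are illustrative and not part of this commit.

    import numpy as np
    import tensorflow as tf

    summary_writer = tf.summary.FileWriter('./summaries/ppo')  # illustrative path

    def write_learning_rate(stats, steps):
        # Average the per-step learning rates collected by the trainer and
        # emit them as a scalar summary next to the existing stats.
        if len(stats['learning_rate']) > 0:
            summary = tf.Summary()
            summary.value.add(tag='Info/Learning Rate',  # illustrative tag
                              simple_value=float(np.mean(stats['learning_rate'])))
            summary_writer.add_summary(summary, steps)
            summary_writer.flush()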

python/setup.py (2 changes)


 required = f.read().splitlines()
 setup(name='unityagents',
-      version='0.1',
+      version='0.1.1',
       description='Unity Machine Learning Agents',
       license='Apache License 2.0',
       author='Unity Technologies',
