|
|
|
|
|
|
import tensorflow as tf
import tensorflow.contrib.layers as c_layers

from unityagents import UnityEnvironmentException
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, num_layers=2): |
|
|
|
""" |
|
|
|
Takes a Unity environment and model-specific hyper-parameters and returns the |
|
|
|
appropriate PPO agent model for the environment. |
|
|
|
|
|
|
:return: a sub-class of PPOAgent tailored to the environment. |
|
|
|
:param max_step: Total number of training steps. |
|
|
|
""" |
|
|
|
    if num_layers < 1:
        num_layers = 1
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # Pick the model class that matches the brain's action space.
    if brain.action_space_type == "continuous":
        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, num_layers)
    else:
        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers)
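

# Example (sketch): wiring the factory above to an environment. The executable
# name, hyper-parameter values, and training-loop details below are illustrative
# assumptions, not part of this module.
#
#     from unityagents import UnityEnvironment
#
#     env = UnityEnvironment(file_name="3DBall")
#     model = create_agent_model(env, lr=3e-4, h_size=64, normalize=True)
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         # ... collect experience and apply PPO updates here ...
#         save_model(sess, tf.train.Saver(), model_path="./models", steps=0)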
|
|
|
|
|
|
|
|
|
|
|
def save_model(sess, saver, model_path="./", steps=0):
    """
    Saves the current model to the checkpoint folder.
    :param sess: Current TensorFlow session.
    :param saver: TensorFlow saver for the session.
    :param model_path: Designated model path.
    :param steps: Current number of steps in the training process.
    """
    last_checkpoint = model_path + '/model-' + str(steps) + '.cptk'
    saver.save(sess, last_checkpoint)
    # Also dump the raw graph definition so it can be frozen for inference later.
    tf.train.write_graph(sess.graph_def, model_path, 'raw_graph_def.pb', as_text=False)
|
class PPOModel(object): |
|
|
|
def __init__(self): |
|
|
|
self.normalize = False |
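        # Whether state inputs are normalized with running mean/variance statistics
        # (see create_continuous_state_encoder); concrete models overwrite this with
        # the `normalize` constructor argument.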
|
|
|
|
|
|
|
def create_global_steps(self): |
|
|
|
"""Creates TF ops to track and increment global training step.""" |
|
|
|
        self.global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
        self.increment_step = tf.assign(self.global_step, self.global_step + 1)

    def create_reward_encoder(self):
        """Creates TF ops to track and update the most recent mean cumulative reward."""
        self.last_reward = tf.Variable(0, name="last_reward", trainable=False, dtype=tf.float32)
|
|
|
self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward') |
|
|
|
self.update_reward = tf.assign(self.last_reward, self.new_reward) |
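
        # Sketch (assumed trainer-side usage, not defined in this file): the trainer
        # would periodically run these ops in its session, e.g.
        #     sess.run(model.update_reward, feed_dict={model.new_reward: mean_reward})
        #     sess.run(model.increment_step)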
|
|
|
|
|
|
|
|
|
|
def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation, num_layers): |
|
|
|
""" |
|
|
|
Builds a set of visual (CNN) encoders. |
|
|
|
:param o_size_h: Height observation size. |
|
|
|
|
|
|
use_bias=False, activation=activation) |
|
|
|
self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2], |
|
|
|
use_bias=False, activation=activation) |
|
|
|
|
|
|
hidden = c_layers.flatten(self.conv2) |
|
|
|
            for j in range(num_layers):
                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
            streams.append(hidden)
        return streams
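
    # For reference (illustrative arithmetic, assuming an 84x84 input and the default
    # 'valid' padding): the 8x8/stride-4 convolution yields 20x20 feature maps and the
    # 4x4/stride-2 convolution yields 9x9, which are flattened before the dense layers
    # in create_visual_encoder.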
|
|
|
|
|
|
def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers): |
|
|
|
""" |
|
|
|
Builds a set of hidden state encoders. |
|
|
|
        :param s_size: state input size.
        :param h_size: Hidden layer size.
        :param num_streams: Number of state streams to construct.
        :param activation: What type of activation function to use for layers.
        :param num_layers: Number of hidden layers per stream.
        :return: List of hidden layer tensors.
        """
|
|
|
self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state') |
|
|
|
|
|
|
|
|
|
|
if self.normalize: |
|
|
|
self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32, |
|
|
|
initializer=tf.zeros_initializer()) |
|
|
|
self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32, |
|
|
|
initializer=tf.ones_initializer()) |
|
|
|
self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt( |
|
|
|
self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state") |
|
|
|
|
|
|
self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean') |
|
|
|
self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance') |
|
|
|
self.update_mean = tf.assign(self.running_mean, self.new_mean) |
|
|
|
self.update_variance = tf.assign(self.running_variance, self.new_variance) |
|
|
|
|
|
|
|
|
|
|
else: |
|
|
|
self.normalized_state = self.state_in |
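
        # Sketch (assumed trainer-side usage, not defined in this file): when
        # normalization is enabled, the trainer is expected to refresh the running
        # statistics through the update ops above, e.g.
        #     sess.run([model.update_mean, model.update_variance],
        #              feed_dict={model.new_mean: batch_mean,
        #                         model.new_variance: batch_variance})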
|
|
|
        streams = []
        for i in range(num_streams):
|
|
|
hidden = self.normalized_state |
|
|
|
for j in range(num_layers): |
|
|
|
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation) |
|
|
|
            streams.append(hidden)
        return streams
|
|
|
|
|
|
def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation, num_layers): |
|
|
|
""" |
|
|
|
Builds a set of hidden state encoders from discrete state input. |
|
|
|
        :param s_size: state input size (discrete).
        :param h_size: Hidden layer size.
        :param num_streams: Number of state streams to construct.
        :param activation: What type of activation function to use for layers.
        :param num_layers: Number of hidden layers per stream.
        :return: List of hidden layer tensors.
        """
        self.state_in = tf.placeholder(shape=[None, 1], dtype=tf.int32, name='state')
        state_in = tf.reshape(self.state_in, [-1])
|
|
|
state_onehot = c_layers.one_hot_encoding(state_in, s_size) |
|
|
|
        streams = []
        for i in range(num_streams):
            hidden = state_onehot
|
|
|
for j in range(num_layers): |
|
|
|
hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation) |
|
|
|
streams.append(hidden) |
|
|
|
return streams |
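
    # Note: each entry of the returned list is an independent hidden stack built on
    # the shared (one-hot or normalized) input. The models below presumably use
    # num_streams=2 to keep the policy and value heads from sharing hidden layers,
    # and num_streams=1 when a single shared stream is enough.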
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ContinuousControlModel(PPOModel): |
|
|
|
|
|
|
def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers): |
|
|
|
super().__init__() |
|
|
|
self.normalize = normalize |
|
|
|
self.create_global_steps() |
|
|
|
        self.create_reward_encoder()

        hidden_state, hidden_visual = None, None
        if brain.number_observations > 0:
            height_size = brain.camera_resolutions[0]['height']
            width_size = brain.camera_resolutions[0]['width']
            bw = brain.camera_resolutions[0]['blackAndWhite']
|
|
|
hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers) |
|
|
|
        if brain.state_space_size > 0:
            s_size = brain.state_space_size
            if brain.state_space_type == "continuous":
|
|
|
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers) |
|
|
|
            else:
|
|
|
hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers) |
|
|
|
|
|
|
|
if hidden_visual is None and hidden_state is None: |
|
|
|
raise Exception("No valid network configuration possible. " |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DiscreteControlModel(PPOModel): |
|
|
|
|
|
|
def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers): |
|
|
|
super().__init__() |
|
|
|
        self.normalize = normalize
        self.create_global_steps()
        self.create_reward_encoder()

        hidden_state, hidden_visual = None, None
        if brain.number_observations > 0:
            height_size = brain.camera_resolutions[0]['height']
            width_size = brain.camera_resolutions[0]['width']
            bw = brain.camera_resolutions[0]['blackAndWhite']
|
|
|
hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0] |
|
|
|
        if brain.state_space_size > 0:
            s_size = brain.state_space_size
            if brain.state_space_type == "continuous":
|
|
|
hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0] |
|
|
|
            else:
|
|
|
hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0] |
|
|
|
|
|
|
|
if hidden_visual is None and hidden_state is None: |
|
|
|
raise Exception("No valid network configuration possible. " |
|
|
|