
Additional Tests & Bug Fixes (#854)

* Add tests and fix for sparse tensor warning
* Rename mock communicator parameter
* Test longer sequences
* Curiosity tests and bug fixes
/develop-generalizationTraining-TrainerController
GitHub, 7 years ago
Current commit: 47fc38ab
8 files changed, with 274 additions and 45 deletions
  1. python/tests/mock_communicator.py (4 changes)
  2. python/tests/test_bc.py (8 changes)
  3. python/tests/test_ppo.py (234 changes)
  4. python/tests/test_unityagents.py (8 changes)
  5. python/tests/test_unitytrainers.py (6 changes)
  6. python/unitytrainers/models.py (2 changes)
  7. python/unitytrainers/ppo/models.py (45 changes)
  8. python/unitytrainers/ppo/trainer.py (12 changes)

python/tests/mock_communicator.py (4 changes)


class MockCommunicator(Communicator):
-   def __init__(self, discrete=False, visual_inputs=0):
+   def __init__(self, discrete_action=False, visual_inputs=0):
        """
        Python side of the grpc communication. Python is the client and Unity the server

-       self.is_discrete = discrete
+       self.is_discrete = discrete_action
        self.steps = 0
        self.visual_inputs = visual_inputs
        self.has_been_closed = False
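As a quick orientation aid (not part of the commit), a minimal sketch of how the renamed keyword is consumed, using only the constructor shown above; the import path is an assumption about the test package layout:

# Hypothetical quick check of the renamed constructor keyword (not in this diff).
from tests.mock_communicator import MockCommunicator  # import path assumed

comm = MockCommunicator(discrete_action=True, visual_inputs=1)
assert comm.is_discrete              # set from the renamed `discrete_action` argument
assert comm.visual_inputs == 1
assert not comm.has_been_closed      # presumably flipped to True once the communicator is closed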

python/tests/test_bc.py (8 changes)


    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
-               discrete=False, visual_inputs=0)
+               discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()

    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
-               discrete=True, visual_inputs=0)
+               discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()

    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
-               discrete=True, visual_inputs=2)
+               discrete_action=True, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()

    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
-               discrete=False, visual_inputs=2)
+               discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
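The excerpt above drops the decorators and trailing calls around each test_bc.py snippet. Based on the pattern visible in test_ppo.py below, one of these behavioral-cloning tests presumably has roughly the following shape; the test name, decorator targets, import paths, and the sess.run(init) call are assumptions, not part of the shown diff:

# Presumed overall shape of one test_bc.py test (sketch; names and imports assumed).
import mock  # or: from unittest import mock
import tensorflow as tf
from unityagents import UnityEnvironment
from unitytrainers.bc.models import BehavioralCloningModel  # import path assumed
from .mock_communicator import MockCommunicator


@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_cc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)
            env.close()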

python/tests/test_ppo.py (234 changes)


import tensorflow as tf
from unitytrainers.ppo.models import PPOModel
from unitytrainers.ppo.trainer import discount_rewards
from unityagents import UnityEnvironment
from .mock_communicator import MockCommunicator

-def test_ppo_model_continuous(mock_communicator, mock_launcher):
+def test_ppo_model_cc_vector(mock_communicator, mock_launcher):
-               discrete=False, visual_inputs=0)
+               discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])

            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()


@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_cc_visual(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.probs, model.value, model.entropy,
                        model.learning_rate]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.visual_in[1]: np.ones([2, 40, 30, 3])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()

-def test_ppo_model_discrete(mock_communicator, mock_launcher):
+def test_ppo_model_dc_visual(mock_communicator, mock_launcher):
-               discrete=True, visual_inputs=2)
+               discrete_action=True, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()

            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),


@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_dc_vector(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_dc_vector_rnn(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            memory_size = 128
            model = PPOModel(env.brains["RealFakeBrain"], use_recurrent=True, m_size=memory_size)
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.memory_out]
            feed_dict = {model.batch_size: 1,
                         model.sequence_length: 2,
                         model.prev_action: [0, 0],
                         model.memory_in: np.zeros((1, memory_size)),
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()


@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_cc_vector_rnn(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            memory_size = 128
            model = PPOModel(env.brains["RealFakeBrain"], use_recurrent=True, m_size=memory_size)
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.memory_out]
            feed_dict = {model.batch_size: 1,
                         model.sequence_length: 2,
                         model.memory_in: np.zeros((1, memory_size)),
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_dc_vector_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                         [3, 4, 5, 3, 4, 5]]),
                         model.action_holder: [0, 0]}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()


@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_cc_vector_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                         [3, 4, 5, 3, 4, 5]]),
                         model.output: [[0.0, 0.0], [0.0, 0.0]]}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_dc_visual_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                         [3, 4, 5, 3, 4, 5]]),
                         model.action_holder: [0, 0],
                         model.visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.visual_in[1]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[1]: np.ones([2, 40, 30, 3])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()


@mock.patch('unityagents.UnityEnvironment.executable_launcher')
@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_ppo_model_cc_visual_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)
            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                    [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                                         [3, 4, 5, 3, 4, 5]]),
                         model.output: [[0.0, 0.0], [0.0, 0.0]],
                         model.visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.visual_in[1]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[1]: np.ones([2, 40, 30, 3])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_rl_functions():
    rewards = np.array([0.0, 0.0, 0.0, 1.0])
    gamma = 0.9
    returns = discount_rewards(rewards, gamma, 0.0)
    np.testing.assert_array_almost_equal(returns, np.array([0.729, 0.81, 0.9, 1.0]))


if __name__ == '__main__':
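test_rl_functions pins down discount_rewards numerically: with gamma = 0.9, a single terminal reward of 1.0, and a bootstrap value of 0.0, the returns must come out as [0.729, 0.81, 0.9, 1.0]. Below is a stand-in implementation that reproduces those numbers, useful for reading the test without unitytrainers at hand (a sketch, not necessarily the exact code in unitytrainers.ppo.trainer):

import numpy as np


def discount_rewards(r, gamma=0.99, value_next=0.0):
    """Discounted return per timestep, bootstrapped with value_next after the last step (sketch)."""
    discounted_r = np.zeros_like(r)
    running_add = value_next
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


# Matches the expectation above:
# discount_rewards(np.array([0.0, 0.0, 0.0, 1.0]), 0.9, 0.0) -> [0.729, 0.81, 0.9, 1.0]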

python/tests/test_unityagents.py (8 changes)


@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_initialization(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
-       discrete=False, visual_inputs=0)
+       discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    with pytest.raises(UnityActionException):
        env.step([0])

@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_reset(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
-       discrete=False, visual_inputs=0)
+       discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()

@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_step(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
-       discrete=False, visual_inputs=0)
+       discrete_action=False, visual_inputs=0)
    env = UnityEnvironment(' ')
    brain = env.brains['RealFakeBrain']
    brain_info = env.reset()

@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_close(mock_communicator, mock_launcher):
    comm = MockCommunicator(
-       discrete=False, visual_inputs=0)
+       discrete_action=False, visual_inputs=0)
    mock_communicator.return_value = comm
    env = UnityEnvironment(' ')
    assert env._loaded

python/tests/test_unitytrainers.py (6 changes)


@mock.patch('unityagents.UnityEnvironment.get_communicator')
def test_initialization(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
-       discrete=True, visual_inputs=1)
+       discrete_action=True, visual_inputs=1)
    tc = TrainerController(' ', ' ', 1, None, True, True, False, 1,
                           1, 1, 1, '', "tests/test_unitytrainers.py", False)
    assert(tc.env.brain_names[0] == 'RealFakeBrain')

    with mock.patch(open_name, create=True) as _:
        mock_load.return_value = dummy_config
        mock_communicator.return_value = MockCommunicator(
-           discrete=True, visual_inputs=1)
+           discrete_action=True, visual_inputs=1)
        mock_load.return_value = dummy_config
        tc = TrainerController(' ', ' ', 1, None, True, True, False, 1,
                               1, 1, 1, '', '', False)

    with mock.patch('yaml.load') as mock_load:
        with mock.patch(open_name, create=True) as _:
            mock_communicator.return_value = MockCommunicator(
-               discrete=True, visual_inputs=1)
+               discrete_action=True, visual_inputs=1)
            tc = TrainerController(' ', ' ', 1, None, True, True, False, 1,
                                   1, 1, 1, '', "tests/test_unitytrainers.py", False)

python/unitytrainers/models.py (2 changes)


        self.visual_in = []
        self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name='batch_size')
        self.sequence_length = tf.placeholder(shape=None, dtype=tf.int32, name='sequence_length')
+       self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name='masks')
+       self.mask = tf.cast(self.mask_input, tf.int32)
        self.m_size = m_size
        self.normalize = normalize
        self.use_recurrent = use_recurrent
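The integer self.mask added here is what the tf.dynamic_partition(..., self.mask, 2)[1] calls in ppo/models.py below consume: partition 1 collects exactly the entries whose mask value is 1, so taking its mean averages only over real (non-padded) steps, which gives the same result tf.boolean_mask produced before. A small illustrative sketch of that equivalence (toy values only; runs eagerly under TF 2, whereas the TF 1 graph code in this repo would evaluate the tensors in a session):

import tensorflow as tf

per_step_loss = tf.constant([1.0, 2.0, 3.0, 4.0])
mask_input = tf.constant([1.0, 0.0, 1.0, 1.0])   # 1.0 = real step, 0.0 = padding
mask = tf.cast(mask_input, tf.int32)

mean_via_boolean_mask = tf.reduce_mean(tf.boolean_mask(per_step_loss, tf.equal(mask_input, 1.0)))
mean_via_partition = tf.reduce_mean(tf.dynamic_partition(per_step_loss, mask, 2)[1])
# Both average only the unmasked entries: (1.0 + 3.0 + 4.0) / 3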

python/unitytrainers/ppo/models.py (45 changes)


            # Create the encoder ops for current and next visual input. Note that these encoders are siamese.
            encoded_visual = self.create_visual_observation_encoder(self.visual_in[i], self.curiosity_enc_size,
-                                                                    self.swish, 1, "visual_obs_encoder", False)
-           encoded_next_visual = self.create_visual_observation_encoder(self.next_visual_in[i], self.curiosity_enc_size,
-                                                                         self.swish, 1, "visual_obs_encoder", True)
+                                                                    self.swish, 1, "stream_{}_visual_obs_encoder"
+                                                                    .format(i), False)
+           encoded_next_visual = self.create_visual_observation_encoder(self.next_visual_in[i],
+                                                                         self.curiosity_enc_size,
+                                                                         self.swish, 1,
+                                                                         "stream_{}_visual_obs_encoder".format(i),
+                                                                         True)
            visual_encoders.append(encoded_visual)
            next_visual_encoders.append(encoded_next_visual)

        if self.o_size > 0:
            # Create input op for next (t+1) vector observation.
-           self.next_vector_obs = tf.placeholder(shape=[None, self.o_size], dtype=tf.float32,
-                                                 name='next_vector_observation')
+           self.next_vector_in = tf.placeholder(shape=[None, self.o_size], dtype=tf.float32,
+                                                name='next_vector_observation')
-           encoded_vector_obs = self.create_continuous_observation_encoder(self.vector_in, self.curiosity_enc_size,
+           encoded_vector_obs = self.create_continuous_observation_encoder(self.vector_in,
+                                                                           self.curiosity_enc_size,
-           encoded_next_vector_obs = self.create_continuous_observation_encoder(self.next_vector_obs,
-                                                                                self.curiosity_enc_size, self.swish,
-                                                                                2, "vector_obs_encoder", True)
+           encoded_next_vector_obs = self.create_continuous_observation_encoder(self.next_vector_in,
+                                                                                self.curiosity_enc_size,
+                                                                                self.swish, 2, "vector_obs_encoder",
+                                                                                True)
            encoded_state_list.append(encoded_vector_obs)
            encoded_next_state_list.append(encoded_next_vector_obs)

        if self.brain.vector_action_space_type == "continuous":
            pred_action = tf.layers.dense(hidden, self.a_size, activation=None)
            squared_difference = tf.reduce_sum(tf.squared_difference(pred_action, self.selected_actions), axis=1)
-           self.inverse_loss = tf.reduce_mean(squared_difference)
+           self.inverse_loss = tf.reduce_mean(tf.dynamic_partition(squared_difference, self.mask, 2)[1])

-           self.inverse_loss = tf.reduce_mean(cross_entropy)
+           self.inverse_loss = tf.reduce_mean(tf.dynamic_partition(cross_entropy, self.mask, 2)[1])
    def create_forward_model(self, encoded_state, encoded_next_state):
        """

        """
        combined_input = tf.concat([encoded_state, self.selected_actions], axis=1)
        hidden = tf.layers.dense(combined_input, 256, activation=self.swish)
-       pred_next_state = tf.layers.dense(hidden, self.curiosity_enc_size, activation=None)
+       # We compare against the concatenation of all observation streams, hence `self.v_size+1`.
+       pred_next_state = tf.layers.dense(hidden, self.curiosity_enc_size * (self.v_size+1), activation=None)

-       self.forward_loss = tf.reduce_mean(squared_difference)
+       self.forward_loss = tf.reduce_mean(tf.dynamic_partition(squared_difference, self.mask, 2)[1])
    def create_ppo_optimizer(self, probs, old_probs, value, entropy, beta, epsilon, lr, max_step):
        """

        self.learning_rate = tf.train.polynomial_decay(lr, self.global_step, max_step, 1e-10, power=1.0)
        self.old_value = tf.placeholder(shape=[None], dtype=tf.float32, name='old_value_estimates')
-       self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name='masks')
-       mask = tf.equal(self.mask_input, 1.0)
        clipped_value_estimate = self.old_value + tf.clip_by_value(tf.reduce_sum(value, axis=1) - self.old_value,
                                                                   -decay_epsilon, decay_epsilon)

-       self.value_loss = tf.reduce_mean(tf.boolean_mask(tf.maximum(v_opt_a, v_opt_b), mask))
+       self.value_loss = tf.reduce_mean(tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.mask, 2)[1])

        # Here we calculate PPO policy loss. In continuous control this is done independently for each action gaussian
        # and then averaged together. This provides significantly better performance than treating the probability

        p_opt_b = tf.clip_by_value(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * self.advantage
-       self.policy_loss = -tf.reduce_mean(tf.boolean_mask(tf.minimum(p_opt_a, p_opt_b), mask))
+       self.policy_loss = -tf.reduce_mean(tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.mask, 2)[1])

-                                          tf.boolean_mask(entropy, mask))
+                                          tf.dynamic_partition(entropy, self.mask, 2)[1])

        if self.use_curiosity:
            self.loss += 10 * (0.2 * self.forward_loss + 0.8 * self.inverse_loss)
        self.update_batch = optimizer.minimize(self.loss)
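For reference, the clipped surrogate that p_opt_a/p_opt_b implement can be traced with plain numpy. The sketch below assumes the standard PPO form p_opt_a = r_theta * advantage (that line is not in the excerpt) and uses illustrative numbers; it is not the repository's code:

import numpy as np

epsilon = 0.2                             # stand-in for decay_epsilon
r_theta = np.array([0.8, 1.5, 1.05])      # new_prob / old_prob per sample
advantage = np.array([1.0, 1.0, -2.0])
mask = np.array([1, 1, 0])                # 1 = real step, 0 = padded step

p_opt_a = r_theta * advantage                                         # unclipped surrogate (assumed form)
p_opt_b = np.clip(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage  # clipped surrogate
surrogate = np.minimum(p_opt_a, p_opt_b)

# numpy counterpart of tf.dynamic_partition(..., self.mask, 2)[1]: keep only unmasked entries
policy_loss = -surrogate[mask == 1].mean()   # -(0.8 + 1.2) / 2 = -1.0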

python/unitytrainers/ppo/trainer.py (12 changes)


# # Unity ML Agents
# ## ML-Agent Learning (PPO)
-# Contains an implementation of PPO as described [here](https://arxiv.org/abs/1707.06347).
+# Contains an implementation of PPO as described (https://arxiv.org/abs/1707.06347).
import logging
import os

                feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
        if self.use_vector_obs:
            feed_dict[self.model.vector_in] = curr_info.vector_observations
-           feed_dict[self.model.next_vector_obs] = next_info.vector_observations
+           feed_dict[self.model.next_vector_in] = next_info.vector_observations
        intrinsic_rewards = self.sess.run(self.model.intrinsic_reward,
                                          feed_dict=feed_dict) * float(self.has_updated)

                        next_info.visual_observations[i][idx])
                if self.use_vector_obs:
                    self.training_buffer[agent_id]['vector_obs'].append(stored_info.vector_observations[idx])
-                   self.training_buffer[agent_id]['next_vector_obs'].append(
+                   self.training_buffer[agent_id]['next_vector_in'].append(
                        next_info.vector_observations[next_idx])
                if self.use_recurrent:
                    if stored_info.memories.shape[1] == 0:

            [-1, self.brain.vector_action_space_size])}
        if self.is_continuous_action:
            feed_dict[self.model.output_pre] = np.array(buffer['actions_pre'][start:end]).reshape(
                [-1, self.brain.vector_action_space_size])
        else:
            feed_dict[self.model.action_holder] = np.array(buffer['actions'][start:end]).flatten()
        if self.use_recurrent:

            feed_dict[self.model.vector_in] = np.array(buffer['vector_obs'][start:end]).reshape(
                [-1, total_observation_length])
            if self.use_curiosity:
-               feed_dict[self.model.next_vector_obs] = np.array(buffer['next_vector_obs'][start:end])\
+               feed_dict[self.model.next_vector_in] = np.array(buffer['next_vector_in'][start:end]) \
                    [-1, self.brain.num_stacked_vector_observations])
        if self.use_visual_obs:
            for i, _ in enumerate(self.model.visual_in):
                _obs = np.array(buffer['visual_obs%d' % i][start:end])
