
Initial support for multiple observations (#256)

* Initial support for multiple observations

* Fix PPO for continuous control
/tag-0.2.1
Arthur Juliani, 7 years ago
Commit: adedd491
3 files changed, 38 insertions, 16 deletions
  1. python/ppo/history.py (15 changes)
  2. python/ppo/models.py (27 changes)
  3. python/ppo/trainer.py (12 changes)
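The same pattern runs through all three files: the single 'observations' entry becomes a family of indexed keys ('observations0', 'observations1', ...) and the single observation placeholder becomes a list with one placeholder per camera. A minimal sketch of that naming convention (illustrative only; these names and counts are not part of the commit):

num_cameras = 2  # hypothetical brain with two cameras

history = {}
for i in range(num_cameras):
    history['observations%d' % i] = []  # one experience list per camera

# The buffer code below rediscovers these keys by prefix rather than hard-coding them.
observation_keys = [key for key in history if key.startswith('observations')]
print(observation_keys)  # ['observations0', 'observations1']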

python/ppo/history.py (15 changes)


import numpy as np

-history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
+history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
                'value_estimates', 'advantages', 'discounted_returns']

"""
for key in history_keys:
agent_dict[key] = []
for i, _ in enumerate(key for key in agent_dict.keys() if key.startswith('observations')):
agent_dict['observations%d' % i] = []
return agent_dict

    :return: dictionary of numpy arrays.
    """
    for key in history_keys:
        agent_dict[key] = np.array(agent_dict[key])
+    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict[key] = np.array(agent_dict[key])
    return agent_dict

        history_dict[agent] = empty_local_history(history_dict[agent])
        history_dict[agent]['cumulative_reward'] = 0
        history_dict[agent]['episode_steps'] = 0
+        for i, _ in enumerate(agent_info.observations):
+            history_dict[agent]['observations%d' % i] = []
    return history_dict

"""
for key in history_keys:
global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
for key in (key for key in local_buffer.keys() if key.startswith('observations')):
global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
return global_buffer

"""
for key in history_keys:
global_buffer[key] = np.copy(local_buffer[key])
for key in (key for key in local_buffer.keys() if key.startswith('observations')):
global_buffer[key] = np.array(local_buffer[key])
return global_buffer

    s = np.arange(global_buffer[history_keys[2]].shape[0])
    np.random.shuffle(s)
    for key in history_keys:
        if len(global_buffer[key]) > 0:
            global_buffer[key] = global_buffer[key][s]
+    for key in (key for key in global_buffer.keys() if key.startswith('observations')):
+        if len(global_buffer[key]) > 0:
+            global_buffer[key] = global_buffer[key][s]
    return global_buffer
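The buffer helpers above keep the fixed history_keys loop and add a second loop that picks up any 'observationsN' keys by prefix, so history.py never needs to know how many cameras the brain has. A runnable sketch of the same idea, assuming only numpy (the helper and sample data below are illustrative, not the module's API):

import numpy as np

history_keys = ['states', 'actions', 'rewards']

def vectorize(agent_dict):
    # Fixed keys first, then any per-camera 'observationsN' keys, as in vectorize_history.
    for key in history_keys:
        agent_dict[key] = np.array(agent_dict[key])
    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
        agent_dict[key] = np.array(agent_dict[key])
    return agent_dict

agent = {'states': [[0.1], [0.2]], 'actions': [0, 1], 'rewards': [1.0, 0.5],
         'observations0': [np.zeros((84, 84, 3)), np.zeros((84, 84, 3))],  # RGB camera
         'observations1': [np.zeros((84, 84, 1)), np.zeros((84, 84, 1))]}  # black-and-white camera
agent = vectorize(agent)
print(agent['observations0'].shape)  # (2, 84, 84, 3)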

python/ppo/models.py (27 changes)


class PPOModel(object):
    def __init__(self):
        self.normalize = False
+        self.observation_in = []

    def create_global_steps(self):
        """Creates TF ops to track and increment global training step."""

        else:
            c_channels = 3
-        self.observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
-                                             name='observation_0')
+        self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
+                                                  name='observation_%d' % len(self.observation_in)))
-        self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
+        self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
                                      use_bias=False, activation=activation)
        self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                      use_bias=False, activation=activation)

        self.create_reward_encoder()
        hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
+            encoders = []
+            for i in range(brain.number_observations):
+                height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+                bw = brain.camera_resolutions[i]['blackAndWhite']
+                encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
+            hidden_visual = tf.concat(encoders, axis=2)
        if brain.state_space_size > 0:
            s_size = brain.state_space_size
            if brain.state_space_type == "continuous":

        self.normalize = normalize
        hidden_state, hidden_visual, hidden = None, None, None
        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
+            encoders = []
+            for i in range(brain.number_observations):
+                height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+                bw = brain.camera_resolutions[i]['blackAndWhite']
+                encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
+            hidden_visual = tf.concat(encoders, axis=1)
        if brain.state_space_size > 0:
            s_size = brain.state_space_size
            if brain.state_space_type == "continuous":
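In models.py, create_visual_encoder now appends a fresh placeholder (named observation_0, observation_1, ...) to self.observation_in on each call, and both actor-critic constructors build one encoder per camera before concatenating them: the continuous branch requests two streams and concatenates on axis=2, the discrete branch requests one stream and concatenates on axis=1. A condensed sketch of that pattern, assuming TensorFlow 1.x (layer sizes and names here are illustrative, not the commit's values):

import tensorflow as tf

observation_in = []  # plays the role of self.observation_in

def visual_encoder(height, width, channels, h_size):
    # Appends a new placeholder and returns a flat hidden vector for that camera.
    obs = tf.placeholder(shape=[None, height, width, channels], dtype=tf.float32,
                         name='observation_%d' % len(observation_in))
    observation_in.append(obs)
    conv1 = tf.layers.conv2d(obs, 16, kernel_size=[8, 8], strides=[4, 4],
                             use_bias=False, activation=tf.nn.elu)
    conv2 = tf.layers.conv2d(conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                             use_bias=False, activation=tf.nn.elu)
    return tf.layers.dense(tf.layers.flatten(conv2), h_size, activation=tf.nn.elu)

# One encoder per camera; a single-stream model joins them along the feature axis.
encoders = [visual_encoder(84, 84, 3, 128),   # RGB camera
            visual_encoder(84, 84, 1, 128)]   # black-and-white camera
hidden_visual = tf.concat(encoders, axis=1)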

python/ppo/trainer.py (12 changes)


            epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
            feed_dict[self.model.epsilon] = epsi
        if self.use_observations:
-            feed_dict[self.model.observation_in] = np.vstack(info.observations)
+            for i, _ in enumerate(info.observations):
+                feed_dict[self.model.observation_in[i]] = info.observations[i]
        if self.use_states:
            feed_dict[self.model.state_in] = info.states
        if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:

            idx = info.agents.index(agent)
            if not info.local_done[idx]:
                if self.use_observations:
-                    history['observations'].append([info.observations[0][idx]])
+                    for i, _ in enumerate(info.observations):
+                        history['observations%d' % i].append([info.observations[i][idx]])
                if self.use_states:
                    history['states'].append(info.states[idx])
                if self.is_continuous:

                else:
                    feed_dict = {self.model.batch_size: len(info.states)}
                    if self.use_observations:
-                        feed_dict[self.model.observation_in] = np.vstack(info.observations)
+                        for i in range(self.info.observations):
+                            feed_dict[self.model.observation_in[i]] = info.observations[i]
                    if self.use_states:
                        feed_dict[self.model.state_in] = info.states
                    value_next = self.sess.run(self.model.value, feed_dict)[l]

                if self.use_states:
                    feed_dict[self.model.state_in] = np.vstack(training_buffer['states'][start:end])
                if self.use_observations:
-                    feed_dict[self.model.observation_in] = np.vstack(training_buffer['observations'][start:end])
+                    for i, _ in enumerate(self.model.observation_in):
+                        feed_dict[self.model.observation_in[i]] = np.vstack(training_buffer['observations%d' % i][start:end])
                v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
                                                   self.model.update_batch], feed_dict=feed_dict)
                total_v += v_loss
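On the trainer side, every place that used to feed one stacked observation now loops over the placeholder list and feeds one array per camera (and, during updates, one 'observationsN' slice per camera). One line worth noting: the process_experiences hunk iterates range(self.info.observations), which looks like it should walk the observation list the way the neighbouring hunks do with enumerate(info.observations). A self-contained sketch of the feed pattern, with plain strings standing in for the real placeholders:

import numpy as np

observation_in = ['observation_0', 'observation_1']  # stand-ins for the model's placeholders

# One array per camera, each shaped [num_agents, height, width, channels].
observations = [np.zeros((4, 84, 84, 3)), np.zeros((4, 84, 84, 1))]

feed_dict = {}
for i, _ in enumerate(observations):
    feed_dict[observation_in[i]] = observations[i]

print({key: value.shape for key, value in feed_dict.items()})
# {'observation_0': (4, 84, 84, 3), 'observation_1': (4, 84, 84, 1)}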
