
Initial support for multiple observations (#256)

* Initial support for multiple observations

* Fix PPO for continuous control
/tag-0.2.1
Arthur Juliani, 7 years ago
Commit: adedd491
3 files changed, 38 insertions, 16 deletions
  1. python/ppo/history.py (15 changes)
  2. python/ppo/models.py (27 changes)
  3. python/ppo/trainer.py (12 changes)
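The same pattern runs through all three files: the single 'observations' entry becomes a family of indexed keys ('observations0', 'observations1', ...) and the single observation placeholder becomes a list with one placeholder per camera. A minimal sketch of that naming convention (illustrative only; these names and counts are not part of the commit):

num_cameras = 2  # hypothetical brain with two cameras

history = {}
for i in range(num_cameras):
    history['observations%d' % i] = []  # one experience list per camera

# The buffer code below rediscovers these keys by prefix rather than hard-coding them.
observation_keys = [key for key in history if key.startswith('observations')]
print(observation_keys)  # ['observations0', 'observations1']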

python/ppo/history.py (15 changes)


import numpy as np

-history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
+history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
                'value_estimates', 'advantages', 'discounted_returns']

"""
for key in history_keys:
agent_dict[key] = []
for i, _ in enumerate(key for key in agent_dict.keys() if key.startswith('observations')):
agent_dict['observations%d' % i] = []
return agent_dict

    :return: dictionary of numpy arrays.
    """
    for key in history_keys:
        agent_dict[key] = np.array(agent_dict[key])
+    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict[key] = np.array(agent_dict[key])
    return agent_dict

        history_dict[agent] = empty_local_history(history_dict[agent])
        history_dict[agent]['cumulative_reward'] = 0
        history_dict[agent]['episode_steps'] = 0
+        for i, _ in enumerate(agent_info.observations):
+            history_dict[agent]['observations%d' % i] = []
    return history_dict

"""
for key in history_keys:
global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
for key in (key for key in local_buffer.keys() if key.startswith('observations')):
global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
return global_buffer

"""
for key in history_keys:
global_buffer[key] = np.copy(local_buffer[key])
for key in (key for key in local_buffer.keys() if key.startswith('observations')):
global_buffer[key] = np.array(local_buffer[key])
return global_buffer

    s = np.arange(global_buffer[history_keys[2]].shape[0])
    np.random.shuffle(s)
    for key in history_keys:
        if len(global_buffer[key]) > 0:
            global_buffer[key] = global_buffer[key][s]
+    for key in (key for key in global_buffer.keys() if key.startswith('observations')):
+        if len(global_buffer[key]) > 0:
+            global_buffer[key] = global_buffer[key][s]
    return global_buffer
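The buffer helpers above keep the fixed history_keys loop and add a second loop that picks up any 'observationsN' keys by prefix, so history.py never needs to know how many cameras the brain has. A runnable sketch of the same idea, assuming only numpy (the helper and sample data below are illustrative, not the module's API):

import numpy as np

history_keys = ['states', 'actions', 'rewards']

def vectorize(agent_dict):
    # Fixed keys first, then any per-camera 'observationsN' keys, as in vectorize_history.
    for key in history_keys:
        agent_dict[key] = np.array(agent_dict[key])
    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
        agent_dict[key] = np.array(agent_dict[key])
    return agent_dict

agent = {'states': [[0.1], [0.2]], 'actions': [0, 1], 'rewards': [1.0, 0.5],
         'observations0': [np.zeros((84, 84, 3)), np.zeros((84, 84, 3))],  # RGB camera
         'observations1': [np.zeros((84, 84, 1)), np.zeros((84, 84, 1))]}  # black-and-white camera
agent = vectorize(agent)
print(agent['observations0'].shape)  # (2, 84, 84, 3)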

python/ppo/models.py (27 changes)


class PPOModel(object):
    def __init__(self):
        self.normalize = False
+        self.observation_in = []

    def create_global_steps(self):
        """Creates TF ops to track and increment global training step."""

        else:
            c_channels = 3
-        self.observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
-                                             name='observation_0')
+        self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
+                                                  name='observation_%d' % len(self.observation_in)))
-        self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
+        self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
                                      use_bias=False, activation=activation)
        self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                      use_bias=False, activation=activation)

        self.create_reward_encoder()
        hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
+            encoders = []
+            for i in range(brain.number_observations):
+                height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+                bw = brain.camera_resolutions[i]['blackAndWhite']
+                encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
+            hidden_visual = tf.concat(encoders, axis=2)
        if brain.state_space_size > 0:
            s_size = brain.state_space_size
            if brain.state_space_type == "continuous":

        self.normalize = normalize
        hidden_state, hidden_visual, hidden = None, None, None
        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
+            encoders = []
+            for i in range(brain.number_observations):
+                height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+                bw = brain.camera_resolutions[i]['blackAndWhite']
+                encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
+            hidden_visual = tf.concat(encoders, axis=1)
        if brain.state_space_size > 0:
            s_size = brain.state_space_size
            if brain.state_space_type == "continuous":
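In models.py, create_visual_encoder now appends a fresh placeholder (named observation_0, observation_1, ...) to self.observation_in on each call, and both actor-critic constructors build one encoder per camera before concatenating them: the continuous branch requests two streams and concatenates on axis=2, the discrete branch requests one stream and concatenates on axis=1. A condensed sketch of that pattern, assuming TensorFlow 1.x (layer sizes and names here are illustrative, not the commit's values):

import tensorflow as tf

observation_in = []  # plays the role of self.observation_in

def visual_encoder(height, width, channels, h_size):
    # Appends a new placeholder and returns a flat hidden vector for that camera.
    obs = tf.placeholder(shape=[None, height, width, channels], dtype=tf.float32,
                         name='observation_%d' % len(observation_in))
    observation_in.append(obs)
    conv1 = tf.layers.conv2d(obs, 16, kernel_size=[8, 8], strides=[4, 4],
                             use_bias=False, activation=tf.nn.elu)
    conv2 = tf.layers.conv2d(conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                             use_bias=False, activation=tf.nn.elu)
    return tf.layers.dense(tf.layers.flatten(conv2), h_size, activation=tf.nn.elu)

# One encoder per camera; a single-stream model joins them along the feature axis.
encoders = [visual_encoder(84, 84, 3, 128),   # RGB camera
            visual_encoder(84, 84, 1, 128)]   # black-and-white camera
hidden_visual = tf.concat(encoders, axis=1)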

python/ppo/trainer.py (12 changes)


            epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
            feed_dict[self.model.epsilon] = epsi
        if self.use_observations:
-            feed_dict[self.model.observation_in] = np.vstack(info.observations)
+            for i, _ in enumerate(info.observations):
+                feed_dict[self.model.observation_in[i]] = info.observations[i]
        if self.use_states:
            feed_dict[self.model.state_in] = info.states
        if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:

            idx = info.agents.index(agent)
            if not info.local_done[idx]:
                if self.use_observations:
-                    history['observations'].append([info.observations[0][idx]])
+                    for i, _ in enumerate(info.observations):
+                        history['observations%d' % i].append([info.observations[i][idx]])
                if self.use_states:
                    history['states'].append(info.states[idx])
                if self.is_continuous:

                else:
                    feed_dict = {self.model.batch_size: len(info.states)}
                    if self.use_observations:
-                        feed_dict[self.model.observation_in] = np.vstack(info.observations)
+                        for i in range(self.info.observations):
+                            feed_dict[self.model.observation_in[i]] = info.observations[i]
                    if self.use_states:
                        feed_dict[self.model.state_in] = info.states
                    value_next = self.sess.run(self.model.value, feed_dict)[l]

                if self.use_states:
                    feed_dict[self.model.state_in] = np.vstack(training_buffer['states'][start:end])
                if self.use_observations:
-                    feed_dict[self.model.observation_in] = np.vstack(training_buffer['observations'][start:end])
+                    for i, _ in enumerate(self.model.observation_in):
+                        feed_dict[self.model.observation_in[i]] = np.vstack(training_buffer['observations%d' % i][start:end])
                v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
                                                   self.model.update_batch], feed_dict=feed_dict)
                total_v += v_loss
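On the trainer side, every place that used to feed one stacked observation now loops over the placeholder list and feeds one array per camera (and, during updates, one 'observationsN' slice per camera). One line worth noting: the process_experiences hunk iterates range(self.info.observations), which looks like it should walk the observation list the way the neighbouring hunks do with enumerate(info.observations). A self-contained sketch of the feed pattern, with plain strings standing in for the real placeholders:

import numpy as np

observation_in = ['observation_0', 'observation_1']  # stand-ins for the model's placeholders

# One array per camera, each shaped [num_agents, height, width, channels].
observations = [np.zeros((4, 84, 84, 3)), np.zeros((4, 84, 84, 1))]

feed_dict = {}
for i, _ in enumerate(observations):
    feed_dict[observation_in[i]] = observations[i]

print({key: value.shape for key, value in feed_dict.items()})
# {'observation_0': (4, 84, 84, 3), 'observation_1': (4, 84, 84, 1)}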
