|
|
|
|
|
|
 self.training_buffer = Buffer()
 self.cumulative_rewards = {}
 self.episode_steps = {}
-self.is_continuous = (env.brains[brain_name].vector_action_space_type == "continuous")
+self.is_continuous_action = (env.brains[brain_name].vector_action_space_type == "continuous")
+self.is_continuous_observation = (env.brains[brain_name].vector_observation_space_type == "continuous")
 self.use_observations = (env.brains[brain_name].number_visual_observations > 0)
 self.use_states = (env.brains[brain_name].vector_observation_space_size > 0)
 self.summary_path = trainer_parameters['summary_path']
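The trainer setup above replaces the single `is_continuous` flag, which only described the action space, with separate `is_continuous_action` and `is_continuous_observation` flags, since a brain can mix the two kinds of space. A minimal sketch of the distinction, using a hypothetical stand-in for `env.brains[brain_name]`:

# Hypothetical stand-in for a brain's parameters; the real object comes from the Unity environment.
class FakeBrainParameters:
    vector_action_space_type = "discrete"          # actions are discrete...
    vector_observation_space_type = "continuous"   # ...while vector observations are continuous

brain = FakeBrainParameters()
is_continuous_action = (brain.vector_action_space_type == "continuous")            # False
is_continuous_observation = (brain.vector_observation_space_type == "continuous")  # True
# One shared is_continuous flag cannot describe this combination, hence the split.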
|
|
|
|
|
|
 feed_dict = {self.model.batch_size: len(curr_brain_info.vector_observations), self.model.sequence_length: 1}
 run_list = [self.model.output, self.model.all_probs, self.model.value, self.model.entropy,
             self.model.learning_rate]
-if self.is_continuous:
+if self.is_continuous_action:
     run_list.append(self.model.epsilon)
 elif self.use_recurrent:
     feed_dict[self.model.prev_action] = np.reshape(curr_brain_info.previous_vector_actions, [-1])
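When the action space is continuous, the sampled noise tensor `self.model.epsilon` is fetched in the same session run as the action, value, and entropy outputs so it can be stored with the experience and fed back during the update. This suggests a reparameterized Gaussian policy; a rough sketch of that sampling scheme (illustrative only, `mu` and `log_sigma` are placeholders, not the model's actual tensors):

import numpy as np

def sample_continuous_action(mu, log_sigma, rng=np.random):
    # Deterministic function of (mu, sigma, epsilon): storing epsilon lets the
    # trainer recompute exactly the same action probabilities later.
    mu = np.asarray(mu, dtype=np.float64)
    epsilon = rng.normal(size=mu.shape)
    action = mu + np.exp(log_sigma) * epsilon
    return action, epsilon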
|
|
|
|
|
|
         curr_brain_info.memories = np.zeros((len(curr_brain_info.agents), self.m_size))
     feed_dict[self.model.memory_in] = curr_brain_info.memories
     run_list += [self.model.memory_out]
-if (self.is_training and self.brain.vector_observation_space_type == "continuous" and
+if (self.is_training and self.is_continuous_observation and
         self.use_states and self.trainer_parameters['normalize']):
     new_mean, new_variance = self.running_average(
         curr_brain_info.vector_observations, steps, self.model.running_mean, self.model.running_variance)
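When `normalize` is enabled and the vector observations are continuous, the trainer refreshes a running mean and variance of the observations from each new batch before updating the model's normalization tensors. The `running_average` implementation is not part of this excerpt; a rough sketch of the kind of incremental update it performs (illustrative names and update rule, not the project's exact math):

import numpy as np

def running_average(observations, steps, mean, variance):
    # Incremental per-feature mean/variance update from the current batch of observations.
    batch_mean = np.mean(np.asarray(observations, dtype=np.float64), axis=0)
    new_mean = mean + (batch_mean - mean) / (steps + 1)
    new_variance = variance + (batch_mean - new_mean) * (batch_mean - mean)
    return new_mean, new_variance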
|
|
|
|
|
|
 if stored_info.memories.shape[1] == 0:
     stored_info.memories = np.zeros((len(stored_info.agents), self.m_size))
 self.training_buffer[agent_id]['memory'].append(stored_info.memories[idx])
-if self.is_continuous:
+if self.is_continuous_action:
     epsi = stored_take_action_outputs[self.model.epsilon]
     self.training_buffer[agent_id]['epsilons'].append(epsi[idx])
 actions = stored_take_action_outputs[self.model.output]
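Each agent's step is appended to its own per-field lists in the training buffer: the recurrent memory slice always, the sampled `epsilons` only when the action space is continuous, and then the chosen actions. A minimal sketch of that per-agent, per-field append pattern (a simplified stand-in, not the project's `Buffer` class):

from collections import defaultdict

class SimpleBuffer(dict):
    # Maps agent_id -> {field name -> list of per-step values}.
    def __missing__(self, agent_id):
        self[agent_id] = defaultdict(list)
        return self[agent_id]

buffer = SimpleBuffer()
buffer["agent-0"]["memory"].append([0.0, 0.0, 0.0, 0.0])   # recurrent state for this step
buffer["agent-0"]["epsilons"].append([0.1, -0.3])          # only stored for continuous actions
buffer["agent-0"]["actions"].append([0.7, -0.2])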
|
|
|
|
|
|
 if info.memories.shape[1] == 0:
     info.memories = np.zeros((len(info.vector_observations), self.m_size))
 feed_dict[self.model.memory_in] = info.memories
-if not self.is_continuous and self.use_recurrent:
+if not self.is_continuous_action and self.use_recurrent:
     feed_dict[self.model.prev_action] = np.reshape(info.previous_vector_actions, [-1])
 value_next = self.sess.run(self.model.value, feed_dict)[l]
 agent_id = info.agents[l]
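`value_next` is the critic's value estimate for agent `l`'s latest state; it is used to bootstrap the returns and advantages that end up in the buffer's `advantages` field, consumed by the update step below. The exact estimator is not shown in this excerpt; a generic bootstrapped, GAE-style computation of that shape might look like this (hypothetical `gamma`/`lambd` defaults):

import numpy as np

def bootstrapped_advantages(rewards, value_estimates, value_next, gamma=0.99, lambd=0.95):
    # Advantages over one agent's trajectory, bootstrapped with the value of the last state.
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.append(np.asarray(value_estimates, dtype=np.float64), value_next)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lambd * running
        advantages[t] = running
    return advantages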
|
|
|
|
|
|
     self.model.advantage: np.array(_buffer['advantages'][start:end]).reshape([-1, 1]),
     self.model.all_old_probs: np.array(
         _buffer['action_probs'][start:end]).reshape([-1, self.brain.vector_action_space_size])}
-if self.is_continuous:
+if self.is_continuous_action:
     feed_dict[self.model.epsilon] = np.array(
         _buffer['epsilons'][start:end]).reshape([-1, self.brain.vector_action_space_size])
 else:
|
|
|
|
|
|
         feed_dict[self.model.prev_action] = np.array(
             _buffer['prev_action'][start:end]).reshape([-1])
 if self.use_states:
-    if self.brain.vector_observation_space_type == "continuous":
+    if self.is_continuous_observation:
         feed_dict[self.model.vector_in] = np.array(
             _buffer['states'][start:end]).reshape(
             [-1, self.brain.vector_observation_space_size * self.brain.num_stacked_vector_observations])
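During the update, every buffer field is sliced with the same `[start:end]` window and reshaped to its placeholder's expected shape before being fed to the graph, so the rows of all fields stay aligned within a mini-batch. A rough sketch of that slicing pattern (hypothetical helper, not the trainer's actual loop):

import numpy as np

def minibatch_feeds(buffer_fields, batch_size):
    # Yield one dict per mini-batch; every field is cut with the same window so rows stay aligned.
    total = len(next(iter(buffer_fields.values())))
    for start in range(0, total, batch_size):
        end = start + batch_size
        yield {name: np.array(values[start:end]) for name, values in buffer_fields.items()}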
|
|
|