
Use switch between old and new behavior

Branch: develop-generalizationTraining-TrainerController
Arthur Juliani, 6 years ago
Current commit: 0c6411c2
1 changed file with 55 additions and 40 deletions

    python/unitytrainers/ppo/trainer.py (95 lines changed)

        else:
            return run_out[self.model.output], None, None, run_out

-    def generate_intrinsic_rewards(self, next_info):
+    def generate_intrinsic_rewards(self, curr_info, next_info):
        """
        Generates intrinsic reward used for Curiosity-based training.
        :param next_info: Next BrainInfo.
        """
        if self.use_curiosity:
            ...
            if curr_info.agents == next_info.agents:
                if self.use_visual_obs:
                    for i in range(len(curr_info.visual_observations)):
                        feed_dict[self.model.visual_in[i]] = curr_info.visual_observations[i]
                        feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
                if self.use_vector_obs:
                    feed_dict[self.model.vector_in] = curr_info.vector_observations
                    feed_dict[self.model.next_vector_in] = next_info.vector_observations
                if self.use_recurrent:
                    if curr_info.memories.shape[1] == 0:
                        curr_info.memories = np.zeros((len(curr_info.agents), self.m_size))
                    feed_dict[self.model.memory_in] = curr_info.memories
                intrinsic_rewards = self.sess.run(self.model.intrinsic_reward,
                                                  feed_dict=feed_dict) * float(self.has_updated)
                return intrinsic_rewards
            else:
                agent_index_to_ignore = []
                for agent_index, agent_id in enumerate(next_info.agents):
                    if self.training_buffer[agent_id].last_brain_info is None:
                        agent_index_to_ignore.append(agent_index)
                if self.use_visual_obs:
                    for i in range(len(next_info.visual_observations)):
                        tmp = []
                        for agent_id in next_info.agents:
                            agent_brain_info = self.training_buffer[agent_id].last_brain_info
                            if agent_brain_info is None:
                                agent_brain_info = next_info
                            agent_obs = agent_brain_info.visual_observations[i][agent_brain_info.agents.index(agent_id)]
                            tmp += [agent_obs]
                        feed_dict[self.model.visual_in[i]] = np.array(tmp)
                        feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
                if self.use_vector_obs:
                    tmp = []
                    for agent_id in next_info.agents:
                        agent_brain_info = self.training_buffer[agent_id].last_brain_info
                        if agent_brain_info is None:
                            agent_brain_info = next_info
                        agent_obs = agent_brain_info.vector_observations[agent_brain_info.agents.index(agent_id)]
                        tmp += [agent_obs]
                    feed_dict[self.model.vector_in] = np.array(tmp)
                    feed_dict[self.model.next_vector_in] = next_info.vector_observations
                if self.use_recurrent:
                    tmp = []
                    for agent_id in next_info.agents:
                        agent_brain_info = self.training_buffer[agent_id].last_brain_info
                        if agent_brain_info is None:
                            agent_brain_info = next_info
                        if agent_brain_info.memories.shape[1] == 0:
                            agent_obs = np.zeros(self.m_size)
                        else:
                            agent_obs = agent_brain_info.memories[agent_brain_info.agents.index(agent_id)]
                        tmp += [agent_obs]
                    feed_dict[self.model.memory_in] = np.array(tmp)
                intrinsic_rewards = self.sess.run(self.model.intrinsic_reward,
                                                  feed_dict=feed_dict) * float(self.has_updated)
                for index in agent_index_to_ignore:
                    intrinsic_rewards[index] = 0
                return intrinsic_rewards
        else:
            return None
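The subtle part of this change is the else branch: when the agent list differs between curr_info and next_info, each agent's previous observation is looked up in that agent's stored last_brain_info, and agents with no stored info fall back to the current info and later have their intrinsic reward zeroed. Below is a minimal, self-contained sketch of that alignment pattern; SimpleBrainInfo, align_previous_obs, and the toy data are illustrative stand-ins, not code from this commit.

from collections import namedtuple
import numpy as np

# Stand-in for BrainInfo: an ordered agent list plus one observation row per agent.
SimpleBrainInfo = namedtuple("SimpleBrainInfo", ["agents", "vector_observations"])

def align_previous_obs(next_info, last_infos):
    """Build one previous-observation row per agent in next_info, in that order."""
    rows, ignore = [], []
    for index, agent_id in enumerate(next_info.agents):
        prev = last_infos.get(agent_id)
        if prev is None:
            # No stored info for this agent yet: reuse the current info and
            # remember the index so its intrinsic reward can be zeroed later.
            prev = next_info
            ignore.append(index)
        rows.append(prev.vector_observations[prev.agents.index(agent_id)])
    return np.array(rows), ignore

# Agent "b" is new this step, so its intrinsic reward would be zeroed afterwards.
prev_info = SimpleBrainInfo(agents=["a", "c"], vector_observations=np.array([[0.1], [0.3]]))
next_info = SimpleBrainInfo(agents=["a", "b", "c"], vector_observations=np.array([[0.2], [0.5], [0.4]]))
obs, ignore = align_previous_obs(next_info, {"a": prev_info, "b": None, "c": prev_info})
print(obs.ravel(), ignore)   # [0.1 0.5 0.3] [1]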

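Both branches of generate_intrinsic_rewards also check for empty recurrent memories (shape (N, 0)) and substitute zeros of width m_size before feeding the model. A small sketch of just that check; the memory size and toy shapes are assumptions:

import numpy as np

m_size = 64                            # assumed recurrent memory size
memories = np.zeros((3, 0))            # e.g. memories for 3 agents before any step was stored

if memories.shape[1] == 0:             # nothing recorded yet: feed zeros instead
    memories = np.zeros((len(memories), m_size))

print(memories.shape)                  # (3, 64)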
        curr_info = curr_all_info[self.brain_name]
        next_info = next_all_info[self.brain_name]
-        # intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info)
-        intrinsic_rewards = self.generate_intrinsic_rewards(next_info)
+        intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info)
        for agent_id in next_info.agents:
            stored_info = self.training_buffer[agent_id].last_brain_info
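At the call site, the trainer now passes both curr_info and next_info. Since generate_intrinsic_rewards returns None when curiosity is disabled, a caller that adds the bonus to the environment reward has to handle that case; the helper below is a hypothetical sketch of such a guard, not code from trainer.py.

import numpy as np

def combined_rewards(env_rewards, intrinsic_rewards):
    """Add the curiosity bonus when present; otherwise use the env rewards as-is."""
    env_rewards = np.asarray(env_rewards, dtype=float)
    if intrinsic_rewards is None:          # curiosity disabled
        return env_rewards
    return env_rewards + np.asarray(intrinsic_rewards, dtype=float)

print(combined_rewards([1.0, 0.0], None))          # [1. 0.]
print(combined_rewards([1.0, 0.0], [0.2, 0.3]))    # [1.2 0.3]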
