import numpy as np
import tensorflow as tf

from unityagents import AllBrainInfo, BrainInfo
from unitytrainers.buffer import Buffer
from unitytrainers.ppo.models import PPOModel
from unitytrainers.trainer import UnityTrainerException, Trainer
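
# The trainer methods excerpted below augment the environment reward with a curiosity-based
# intrinsic reward: construct_curr_info() realigns each agent's most recent observations with the
# agents present in the next step, and generate_intrinsic_rewards() feeds those paired
# observations to the model to produce per-agent intrinsic rewards.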
|
|
|
|
|
|
        else:
            # Non-recurrent branch: no memories to return alongside the model's action output.
            return run_out[self.model.output], None, None, run_out
|
|
|
|
|
|
|
    def construct_curr_info(self, next_info: BrainInfo) -> BrainInfo:
        """
        Constructs a BrainInfo which contains the most recent previous experiences for all agents
        which correspond to the agents in a provided next_info.
        :BrainInfo next_info: A t+1 BrainInfo.
        :return: curr_info: Reconstructed BrainInfo to match the agents of next_info.
        """
|
|
|
        # One list per visual observation (camera), to be filled per agent below.
        visual_observations = [[] for _ in next_info.visual_observations]
        vector_observations = []
        text_observations = []
        memories = []
        rewards = []
        local_dones = []
        max_reacheds = []
        agents = []
        prev_vector_actions = []
        prev_text_actions = []
        for agent_id in next_info.agents:
            agent_brain_info = self.training_buffer[agent_id].last_brain_info
            # Fall back to next_info when no previous BrainInfo is stored for this agent.
            if agent_brain_info is None:
                agent_brain_info = next_info
            agent_index = agent_brain_info.agents.index(agent_id)
            for i in range(len(next_info.visual_observations)):
                visual_observations[i].append(agent_brain_info.visual_observations[i][agent_index])
            vector_observations.append(agent_brain_info.vector_observations[agent_index])
            text_observations.append(agent_brain_info.text_observations[agent_index])
            if self.use_recurrent:
                memories.append(agent_brain_info.memories[agent_index])
            rewards.append(agent_brain_info.rewards[agent_index])
            local_dones.append(agent_brain_info.local_done[agent_index])
            max_reacheds.append(agent_brain_info.max_reached[agent_index])
            agents.append(agent_brain_info.agents[agent_index])
            prev_vector_actions.append(agent_brain_info.previous_vector_actions[agent_index])
            prev_text_actions.append(agent_brain_info.previous_text_actions[agent_index])
        curr_info = BrainInfo(visual_observations, vector_observations, text_observations, memories,
                              rewards, agents, local_dones, prev_vector_actions, prev_text_actions,
                              max_reacheds)
        return curr_info
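
    # construct_curr_info() matters when the set of agents that requested a decision changes
    # between consecutive steps (as it can with On-Demand Decision making): it lets the current
    # and next observations fed to the curiosity model stay aligned agent-by-agent.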
|
|
|
|
|
|
|
    def generate_intrinsic_rewards(self, curr_info, next_info):
        """
        Generates intrinsic reward used for Curiosity-based training.
        :BrainInfo curr_info: Current BrainInfo.
        :BrainInfo next_info: Next BrainInfo.
        :return: Intrinsic rewards for all agents.
        """
        feed_dict = {self.model.batch_size: len(next_info.vector_observations), self.model.sequence_length: 1}

        if curr_info.agents != next_info.agents:
            curr_info = self.construct_curr_info(next_info)

        if self.use_visual_obs:
            for i in range(len(curr_info.visual_observations)):
                feed_dict[self.model.visual_in[i]] = curr_info.visual_observations[i]
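
        # The rest of the method presumably feeds the vector observations and the agents' actions
        # in the same way and evaluates the model's intrinsic-reward output with a session run.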
|
|
|
|
|
|
        # Look up the BrainInfo for this trainer's brain in the current and next all-brain dicts.
        curr_info = curr_all_info[self.brain_name]
        next_info = next_all_info[self.brain_name]

        intrinsic_rewards = self.generate_intrinsic_rewards(curr_info, next_info)

        for agent_id in next_info.agents:
            stored_info = self.training_buffer[agent_id].last_brain_info
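
            # stored_info is the BrainInfo captured when agent_id last requested a decision; the
            # agent's experience (observations, action, and environment reward plus the matching
            # entry of intrinsic_rewards) is presumably assembled from it and appended to the
            # training buffer.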
|
|
|