import logging

import numpy as np

from mlagents.trainers.bc.models import BehavioralCloningModel
from mlagents.trainers.policy import Policy

logger = logging.getLogger("mlagents.trainers")


class BCPolicy(Policy):
    def __init__(self, seed, brain, trainer_parameters, load):
        """
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_parameters: Defined training parameters.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        """
        super(BCPolicy, self).__init__(seed, brain, trainer_parameters)

        with self.graph.as_default():
            self.model = BehavioralCloningModel(
                h_size=int(trainer_parameters["hidden_units"]),
                lr=float(trainer_parameters["learning_rate"]),
                n_layers=int(trainer_parameters["num_layers"]),
                m_size=self.m_size,
                normalize=False,
                use_recurrent=trainer_parameters["use_recurrent"],
                brain=brain,
                seed=seed,
            )

        if load:
            self._load_graph()
        else:
            self._initialize_graph()

        self.inference_dict = {"action": self.model.sample_action}
        self.update_dict = {
            "policy_loss": self.model.loss,
            "update_batch": self.model.update,
        }
        if self.use_recurrent:
            self.inference_dict["memory_out"] = self.model.memory_out

        # Values fed to the model's dropout placeholder during inference and
        # during parameter updates, respectively.
        self.evaluate_rate = 1.0
        self.update_rate = 0.5

    def evaluate(self, brain_info):
        """
        Evaluates policy for the agent experiences provided.
        :param brain_info: BrainInfo input to network.
        :return: Results of evaluation.
        """
        feed_dict = {
            self.model.dropout_rate: self.evaluate_rate,
            self.model.sequence_length: 1,
        }

        feed_dict = self._fill_eval_dict(feed_dict, brain_info)
        if self.use_recurrent:
            # Initialize empty memories for agents that have none yet.
            if brain_info.memories.shape[1] == 0:
                brain_info.memories = self.make_empty_memory(len(brain_info.agents))
            feed_dict[self.model.memory_in] = brain_info.memories
        run_out = self._execute_model(feed_dict, self.inference_dict)
        return run_out

    def update(self, mini_batch, num_sequences):
        """
        Performs update on model.
        :param mini_batch: Batch of experiences.
        :param num_sequences: Number of sequences to process.
        :return: Results of update.
        """
        feed_dict = {
            self.model.dropout_rate: self.update_rate,
            self.model.batch_size: num_sequences,
            self.model.sequence_length: self.sequence_length,
        }
        if self.use_continuous_act:
            # Continuous control: targets are the demonstrated action vectors.
            feed_dict[self.model.true_action] = mini_batch["actions"].reshape(
                [-1, self.brain.vector_action_space_size[0]]
            )
        else:
            # Discrete control: targets are action indices per branch, with all
            # actions unmasked during the supervised update.
            feed_dict[self.model.true_action] = mini_batch["actions"].reshape(
                [-1, len(self.brain.vector_action_space_size)]
            )
            feed_dict[self.model.action_masks] = np.ones(
                (num_sequences, sum(self.brain.vector_action_space_size))
            )
        if self.use_vec_obs:
            apparent_obs_size = (
                self.brain.vector_observation_space_size
                * self.brain.num_stacked_vector_observations
            )
            feed_dict[self.model.vector_in] = mini_batch["vector_obs"].reshape(
                [-1, apparent_obs_size]
            )
        for i, _ in enumerate(self.model.visual_in):
            visual_obs = mini_batch["visual_obs%d" % i]
            feed_dict[self.model.visual_in[i]] = visual_obs
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = np.zeros([num_sequences, self.m_size])
        run_out = self._execute_model(feed_dict, self.update_dict)
        return run_out
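

# --- Hypothetical usage sketch (not part of the original module) -------------
# A minimal sketch of how a trainer loop might drive BCPolicy: build the policy
# from a Brain and a hyperparameter dict, run update() on a demonstration
# mini-batch, then evaluate() on live observations. The brain, brain_info, and
# mini_batch arguments, as well as the exact trainer_parameters keys, are
# assumptions for illustration; real values come from the Unity environment,
# the demonstration loader, and the trainer configuration.
def _example_bc_training_step(brain, brain_info, mini_batch, num_sequences=32):
    """Sketch of one BC update/evaluate cycle (inputs assumed, not verified)."""
    trainer_parameters = {
        "hidden_units": 128,
        "learning_rate": 3e-4,
        "num_layers": 2,
        "use_recurrent": False,
        # Placeholder values; the base Policy class may expect further keys.
        "sequence_length": 1,
        "memory_size": 0,
        "model_path": "./models/bc_example",
        "keep_checkpoints": 5,
    }
    policy = BCPolicy(
        seed=0, brain=brain, trainer_parameters=trainer_parameters, load=False
    )

    # Supervised update on a mini-batch of recorded demonstrations.
    update_out = policy.update(mini_batch, num_sequences=num_sequences)
    logger.info("BC policy_loss: %s", update_out["policy_loss"])

    # Inference on the agents' current observations.
    eval_out = policy.evaluate(brain_info)
    return eval_out["action"]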