GitHub, 6 years ago
Current commit fbf92810
21 files changed, with 864 insertions and 546 deletions
Files changed (changed lines per file):
- MLAgentsSDK/ProjectSettings/EditorBuildSettings.asset (1)
- MLAgentsSDK/ProjectSettings/ProjectVersion.txt (2)
- MLAgentsSDK/ProjectSettings/UnityConnectSettings.asset (4)
- ml-agents/mlagents/trainers/__init__.py (3)
- ml-agents/mlagents/trainers/bc/__init__.py (1)
- ml-agents/mlagents/trainers/bc/models.py (94)
- ml-agents/mlagents/trainers/bc/trainer.py (143)
- ml-agents/mlagents/trainers/buffer.py (13)
- ml-agents/mlagents/trainers/models.py (147)
- ml-agents/mlagents/trainers/ppo/__init__.py (1)
- ml-agents/mlagents/trainers/ppo/models.py (57)
- ml-agents/mlagents/trainers/ppo/trainer.py (303)
- ml-agents/mlagents/trainers/trainer.py (28)
- ml-agents/mlagents/trainers/trainer_controller.py (56)
- ml-agents/tests/trainers/test_bc.py (50)
- ml-agents/tests/trainers/test_buffer.py (23)
- ml-agents/tests/trainers/test_ppo.py (52)
- ml-agents/tests/trainers/test_trainer_controller.py (1)
- ml-agents/mlagents/trainers/bc/policy.py (87)
- ml-agents/mlagents/trainers/policy.py (146)
- ml-agents/mlagents/trainers/ppo/policy.py (198)
MLAgentsSDK/ProjectSettings/ProjectVersion.txt:
- m_EditorVersion: 2017.1.0f3
+ m_EditorVersion: 2017.1.5f1
ml-agents/mlagents/trainers/bc/__init__.py:
  from .models import *
  from .trainer import *
+ from .policy import *
ml-agents/mlagents/trainers/ppo/__init__.py:
  from .models import *
  from .trainer import *
+ from .policy import *
ml-agents/mlagents/trainers/bc/policy.py (new file, 87 lines):

import logging

import numpy as np
from mlagents.trainers.bc.models import BehavioralCloningModel
from mlagents.trainers.policy import Policy

logger = logging.getLogger("unityagents")


class BCPolicy(Policy):
    def __init__(self, seed, brain, trainer_parameters, sess):
        """
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_parameters: Defined training parameters.
        :param sess: TensorFlow session.
        """
        super().__init__(seed, brain, trainer_parameters, sess)

        self.model = BehavioralCloningModel(
            h_size=int(trainer_parameters['hidden_units']),
            lr=float(trainer_parameters['learning_rate']),
            n_layers=int(trainer_parameters['num_layers']),
            m_size=self.m_size,
            normalize=False,
            use_recurrent=trainer_parameters['use_recurrent'],
            brain=brain,
            scope=self.variable_scope,
            seed=seed)

        self.inference_dict = {'action': self.model.sample_action}
        self.update_dict = {'policy_loss': self.model.loss,
                            'update_batch': self.model.update}
        if self.use_recurrent:
            self.inference_dict['memory_out'] = self.model.memory_out

        self.evaluate_rate = 1.0
        self.update_rate = 0.5

    def evaluate(self, brain_info):
        """
        Evaluates policy for the agent experiences provided.
        :param brain_info: BrainInfo input to network.
        :return: Results of evaluation.
        """
        feed_dict = {self.model.dropout_rate: self.evaluate_rate,
                     self.model.sequence_length: 1}

        feed_dict = self._fill_eval_dict(feed_dict, brain_info)
        if self.use_recurrent:
            if brain_info.memories.shape[1] == 0:
                brain_info.memories = self.make_empty_memory(len(brain_info.agents))
            feed_dict[self.model.memory_in] = brain_info.memories
        run_out = self._execute_model(feed_dict, self.inference_dict)
        return run_out

    def update(self, mini_batch, num_sequences):
        """
        Performs update on model.
        :param mini_batch: Batch of experiences.
        :param num_sequences: Number of sequences to process.
        :return: Results of update.
        """

        feed_dict = {self.model.dropout_rate: self.update_rate,
                     self.model.batch_size: num_sequences,
                     self.model.sequence_length: self.sequence_length}
        if self.use_continuous_act:
            feed_dict[self.model.true_action] = mini_batch['actions']. \
                reshape([-1, self.brain.vector_action_space_size[0]])
        else:
            feed_dict[self.model.true_action] = mini_batch['actions'].reshape(
                [-1, len(self.brain.vector_action_space_size)])
            feed_dict[self.model.action_masks] = np.ones(
                (num_sequences, sum(self.brain.vector_action_space_size)))
        if self.use_vec_obs:
            apparent_obs_size = self.brain.vector_observation_space_size * \
                                self.brain.num_stacked_vector_observations
            feed_dict[self.model.vector_in] = mini_batch['vector_obs'] \
                .reshape([-1, apparent_obs_size])
        for i, _ in enumerate(self.model.visual_in):
            visual_obs = mini_batch['visual_obs%d' % i]
            feed_dict[self.model.visual_in[i]] = visual_obs
        if self.use_recurrent:
            feed_dict[self.model.memory_in] = np.zeros([num_sequences, self.m_size])
        run_out = self._execute_model(feed_dict, self.update_dict)
        return run_out
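A minimal usage sketch (not part of this commit) of how a behavioral-cloning trainer might drive BCPolicy. The dropout placeholder is the main difference from the PPO path: evaluate() feeds dropout_rate=1.0, update() feeds 0.5. The brain, brain_info, and mini_batch objects are assumed to come from the Unity environment and the demonstration buffer, and the trainer_parameters values are illustrative only:

# Illustrative sketch; brain, brain_info, and mini_batch are assumed to exist.
import tensorflow as tf

trainer_parameters = {
    'hidden_units': 128, 'learning_rate': 3e-4, 'num_layers': 2,
    'use_recurrent': False, 'graph_scope': 'FirstBrain', 'sequence_length': 1}

with tf.Session() as sess:
    policy = BCPolicy(seed=0, brain=brain,
                      trainer_parameters=trainer_parameters, sess=sess)
    sess.run(tf.global_variables_initializer())
    run_out = policy.evaluate(brain_info)        # {'action': ...}
    actions = run_out['action']
    # Once the demonstration buffer holds enough sequences:
    stats = policy.update(mini_batch, num_sequences=32)
    # stats contains 'policy_loss' and the 'update_batch' op result.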
ml-agents/mlagents/trainers/policy.py (new file, 146 lines):

import logging
import numpy as np

from mlagents.trainers import UnityException
from mlagents.trainers.models import LearningModel

logger = logging.getLogger("unityagents")


class UnityPolicyException(UnityException):
    """
    Related to errors with the Policy.
    """
    pass


class Policy(object):
    """
    Contains a learning model, and the necessary
    functions to interact with it to perform evaluation and updating.
    """

    def __init__(self, seed, brain, trainer_parameters, sess):
        """
        Initializes the policy.
        :param seed: Random seed to use for TensorFlow.
        :param brain: The corresponding Brain for this policy.
        :param trainer_parameters: The trainer parameters.
        :param sess: The current TensorFlow session.
        """
        self.m_size = None
        self.model = LearningModel(0, False, False, brain, scope='Model', seed=0)
        self.inference_dict = {}
        self.update_dict = {}
        self.sequence_length = 1
        self.seed = seed
        self.brain = brain
        self.variable_scope = trainer_parameters['graph_scope']
        self.use_recurrent = trainer_parameters["use_recurrent"]
        self.use_continuous_act = (brain.vector_action_space_type == "continuous")
        self.sess = sess
        if self.use_recurrent:
            self.m_size = trainer_parameters["memory_size"]
            self.sequence_length = trainer_parameters["sequence_length"]
            if self.m_size == 0:
                raise UnityPolicyException("The memory size for brain {0} is 0 even "
                                           "though the trainer uses recurrent."
                                           .format(brain.brain_name))
            elif self.m_size % 4 != 0:
                raise UnityPolicyException("The memory size for brain {0} is {1} "
                                           "but it must be divisible by 4."
                                           .format(brain.brain_name, self.m_size))

    def evaluate(self, brain_info):
        """
        Evaluates policy for the agent experiences provided.
        :param brain_info: BrainInfo input to network.
        :return: Output from policy based on self.inference_dict.
        """
        raise UnityPolicyException("The evaluate function was not implemented.")

    def update(self, mini_batch, num_sequences):
        """
        Performs update of the policy.
        :param num_sequences: Number of experience trajectories in batch.
        :param mini_batch: Batch of experiences.
        :return: Results of update.
        """
        raise UnityPolicyException("The update function was not implemented.")

    def _execute_model(self, feed_dict, out_dict):
        """
        Executes model.
        :param feed_dict: Input dictionary mapping nodes to input data.
        :param out_dict: Output dictionary mapping names to nodes.
        :return: Dictionary mapping names to output data.
        """
        network_out = self.sess.run(list(out_dict.values()), feed_dict=feed_dict)
        run_out = dict(zip(list(out_dict.keys()), network_out))
        return run_out

    def _fill_eval_dict(self, feed_dict, brain_info):
        for i, _ in enumerate(brain_info.visual_observations):
            feed_dict[self.model.visual_in[i]] = brain_info.visual_observations[i]
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = brain_info.vector_observations
        if not self.use_continuous_act:
            feed_dict[self.model.action_masks] = brain_info.action_masks
        return feed_dict

    def make_empty_memory(self, num_agents):
        """
        Creates empty memory for use with RNNs.
        :param num_agents: Number of agents.
        :return: Numpy array of zeros.
        """
        return np.zeros((num_agents, self.m_size))

    @property
    def graph_scope(self):
        """
        Returns the graph scope of the trainer.
        """
        return self.variable_scope

    def get_current_step(self):
        """
        Gets current model step.
        :return: Current model step.
        """
        step = self.sess.run(self.model.global_step)
        return step

    def increment_step(self):
        """
        Increments model step.
        """
        self.sess.run(self.model.increment_step)

    def get_inference_vars(self):
        """
        :return: List of inference variable names.
        """
        return list(self.inference_dict.keys())

    def get_update_vars(self):
        """
        :return: List of update variable names.
        """
        return list(self.update_dict.keys())

    @property
    def vis_obs_size(self):
        return self.model.vis_obs_size

    @property
    def vec_obs_size(self):
        return self.model.vec_obs_size

    @property
    def use_vis_obs(self):
        return self.model.vis_obs_size > 0

    @property
    def use_vec_obs(self):
        return self.model.vec_obs_size > 0
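Subclasses are expected to build self.model, populate inference_dict and update_dict, and override evaluate() and update(); session execution, observation feeding, and recurrent-memory handling are inherited. A minimal sketch (not part of this commit) of that contract, where brain, trainer_parameters, and sess are assumed to come from the trainer controller and MyModel is a hypothetical stand-in for a LearningModel subclass:

class SketchPolicy(Policy):
    """Hypothetical subclass showing the minimal surface a trainer relies on."""
    def __init__(self, seed, brain, trainer_parameters, sess):
        super().__init__(seed, brain, trainer_parameters, sess)
        # MyModel is a placeholder for a concrete LearningModel subclass.
        self.model = MyModel(brain, scope=self.variable_scope, seed=seed)
        self.inference_dict = {'action': self.model.output}
        self.update_dict = {'loss': self.model.loss,
                            'update_batch': self.model.update_batch}

    def evaluate(self, brain_info):
        feed_dict = {self.model.batch_size: len(brain_info.agents),
                     self.model.sequence_length: 1}
        feed_dict = self._fill_eval_dict(feed_dict, brain_info)
        return self._execute_model(feed_dict, self.inference_dict)

    def update(self, mini_batch, num_sequences):
        feed_dict = {self.model.batch_size: num_sequences,
                     self.model.sequence_length: self.sequence_length}
        return self._execute_model(feed_dict, self.update_dict)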
ml-agents/mlagents/trainers/ppo/policy.py (new file, 198 lines):

import logging

import numpy as np
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.policy import Policy

logger = logging.getLogger("unityagents")


class PPOPolicy(Policy):
    def __init__(self, seed, brain, trainer_params, sess, is_training):
        """
        Policy for Proximal Policy Optimization Networks.
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param sess: TensorFlow session.
        :param is_training: Whether the model should be trained.
        """
        super().__init__(seed, brain, trainer_params, sess)
        self.has_updated = False
        self.use_curiosity = bool(trainer_params['use_curiosity'])
        self.model = PPOModel(brain,
                              lr=float(trainer_params['learning_rate']),
                              h_size=int(trainer_params['hidden_units']),
                              epsilon=float(trainer_params['epsilon']),
                              beta=float(trainer_params['beta']),
                              max_step=float(trainer_params['max_steps']),
                              normalize=trainer_params['normalize'],
                              use_recurrent=trainer_params['use_recurrent'],
                              num_layers=int(trainer_params['num_layers']),
                              m_size=self.m_size,
                              use_curiosity=bool(trainer_params['use_curiosity']),
                              curiosity_strength=float(trainer_params['curiosity_strength']),
                              curiosity_enc_size=float(trainer_params['curiosity_enc_size']),
                              scope=self.variable_scope, seed=seed)

        self.inference_dict = {'action': self.model.output, 'log_probs': self.model.all_log_probs,
                               'value': self.model.value, 'entropy': self.model.entropy,
                               'learning_rate': self.model.learning_rate}
        if self.use_continuous_act:
            self.inference_dict['pre_action'] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict['memory_out'] = self.model.memory_out
        if is_training and self.use_vec_obs and trainer_params['normalize']:
            self.inference_dict['update_mean'] = self.model.update_mean
            self.inference_dict['update_variance'] = self.model.update_variance

        self.update_dict = {'value_loss': self.model.value_loss,
                            'policy_loss': self.model.policy_loss,
                            'update_batch': self.model.update_batch}
        if self.use_curiosity:
            self.update_dict['forward_loss'] = self.model.forward_loss
            self.update_dict['inverse_loss'] = self.model.inverse_loss

    def evaluate(self, brain_info):
        """
        Evaluates policy for the agent experiences provided.
        :param brain_info: BrainInfo object containing inputs.
        :return: Outputs from network as defined by self.inference_dict.
        """
        feed_dict = {self.model.batch_size: len(brain_info.vector_observations),
                     self.model.sequence_length: 1}
        if self.use_recurrent:
            if not self.use_continuous_act:
                feed_dict[self.model.prev_action] = brain_info.previous_vector_actions.reshape(
                    [-1, len(self.model.act_size)])
            if brain_info.memories.shape[1] == 0:
                brain_info.memories = self.make_empty_memory(len(brain_info.agents))
            feed_dict[self.model.memory_in] = brain_info.memories
        feed_dict = self._fill_eval_dict(feed_dict, brain_info)
        run_out = self._execute_model(feed_dict, self.inference_dict)
        return run_out

    def update(self, mini_batch, num_sequences):
        """
        Updates model using buffer.
        :param num_sequences: Number of trajectories in batch.
        :param mini_batch: Experience batch.
        :return: Output from update process.
        """
        feed_dict = {self.model.batch_size: num_sequences,
                     self.model.sequence_length: self.sequence_length,
                     self.model.mask_input: mini_batch['masks'].flatten(),
                     self.model.returns_holder: mini_batch['discounted_returns'].flatten(),
                     self.model.old_value: mini_batch['value_estimates'].flatten(),
                     self.model.advantage: mini_batch['advantages'].reshape([-1, 1]),
                     self.model.all_old_log_probs: mini_batch['action_probs'].reshape(
                         [-1, sum(self.model.act_size)])}
        if self.use_continuous_act:
            feed_dict[self.model.output_pre] = mini_batch['actions_pre'].reshape(
                [-1, self.model.act_size[0]])
        else:
            feed_dict[self.model.action_holder] = mini_batch['actions'].reshape(
                [-1, len(self.model.act_size)])
            if self.use_recurrent:
                feed_dict[self.model.prev_action] = mini_batch['prev_action'].reshape(
                    [-1, len(self.model.act_size)])
            feed_dict[self.model.action_masks] = mini_batch['action_mask'].reshape(
                [-1, sum(self.brain.vector_action_space_size)])
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = mini_batch['vector_obs'].reshape(
                [-1, self.vec_obs_size])
            if self.use_curiosity:
                feed_dict[self.model.next_vector_in] = mini_batch['next_vector_in'].reshape(
                    [-1, self.vec_obs_size])
        if self.model.vis_obs_size > 0:
            for i, _ in enumerate(self.model.visual_in):
                _obs = mini_batch['visual_obs%d' % i]
                if self.sequence_length > 1 and self.use_recurrent:
                    (_batch, _seq, _w, _h, _c) = _obs.shape
                    feed_dict[self.model.visual_in[i]] = _obs.reshape([-1, _w, _h, _c])
                else:
                    feed_dict[self.model.visual_in[i]] = _obs
            if self.use_curiosity:
                for i, _ in enumerate(self.model.visual_in):
                    _obs = mini_batch['next_visual_obs%d' % i]
                    if self.sequence_length > 1 and self.use_recurrent:
                        (_batch, _seq, _w, _h, _c) = _obs.shape
                        feed_dict[self.model.next_visual_in[i]] = _obs.reshape([-1, _w, _h, _c])
                    else:
                        feed_dict[self.model.next_visual_in[i]] = _obs
        if self.use_recurrent:
            mem_in = mini_batch['memory'][:, 0, :]
            feed_dict[self.model.memory_in] = mem_in
        self.has_updated = True
        run_out = self._execute_model(feed_dict, self.update_dict)
        return run_out

    def get_intrinsic_rewards(self, curr_info, next_info):
        """
        Generates intrinsic reward used for Curiosity-based training.
        :BrainInfo curr_info: Current BrainInfo.
        :BrainInfo next_info: Next BrainInfo.
        :return: Intrinsic rewards for all agents.
        """
        if self.use_curiosity:
            if len(curr_info.agents) == 0:
                return []

            feed_dict = {self.model.batch_size: len(next_info.vector_observations),
                         self.model.sequence_length: 1}
            if self.use_continuous_act:
                feed_dict[self.model.output] = next_info.previous_vector_actions
            else:
                feed_dict[self.model.action_holder] = next_info.previous_vector_actions
            for i in range(self.model.vis_obs_size):
                feed_dict[self.model.visual_in[i]] = curr_info.visual_observations[i]
                feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
            if self.use_vec_obs:
                feed_dict[self.model.vector_in] = curr_info.vector_observations
                feed_dict[self.model.next_vector_in] = next_info.vector_observations
            if self.use_recurrent:
                if curr_info.memories.shape[1] == 0:
                    curr_info.memories = self.make_empty_memory(len(curr_info.agents))
                feed_dict[self.model.memory_in] = curr_info.memories
            intrinsic_rewards = self.sess.run(self.model.intrinsic_reward,
                                              feed_dict=feed_dict) * float(self.has_updated)
            return intrinsic_rewards
        else:
            return None

    def get_value_estimate(self, brain_info, idx):
        """
        Generates value estimates for bootstrapping.
        :param brain_info: BrainInfo to be used for bootstrapping.
        :param idx: Index in BrainInfo of agent.
        :return: Value estimate.
        """
        feed_dict = {self.model.batch_size: 1, self.model.sequence_length: 1}
        for i in range(len(brain_info.visual_observations)):
            feed_dict[self.model.visual_in[i]] = [brain_info.visual_observations[i][idx]]
        if self.use_vec_obs:
            feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
        if self.use_recurrent:
            if brain_info.memories.shape[1] == 0:
                brain_info.memories = self.make_empty_memory(len(brain_info.agents))
            feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
        if not self.use_continuous_act and self.use_recurrent:
            feed_dict[self.model.prev_action] = brain_info.previous_vector_actions[idx].reshape(
                [-1, len(self.model.act_size)])
        value_estimate = self.sess.run(self.model.value, feed_dict)
        return value_estimate

    def get_last_reward(self):
        """
        Returns the last reward the trainer has recorded.
        :return: The last reward.
        """
        return self.sess.run(self.model.last_reward)

    def update_reward(self, new_reward):
        """
        Updates reward value for policy.
        :param new_reward: New reward to save.
        """
        self.sess.run(self.model.update_reward,
                      feed_dict={self.model.new_reward: new_reward})
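A sketch (not part of this commit) of the collect/update cycle a PPO trainer runs on top of this policy. The env-provided objects (brain, brain_info, curr_info, next_info), the session, and the minibatch iterable are assumed to come from the trainer controller and its Buffer; trainer_params is assumed to carry the keys read in __init__ (learning_rate, hidden_units, epsilon, beta, max_steps, normalize, use_recurrent, num_layers, use_curiosity, curiosity_strength, curiosity_enc_size, graph_scope):

# Illustrative sketch only; objects marked "assumed" are supplied by the trainer controller.
policy = PPOPolicy(seed=0, brain=brain, trainer_params=trainer_params,
                   sess=sess, is_training=True)

run_out = policy.evaluate(brain_info)                   # action, log_probs, value, entropy, ...
value = policy.get_value_estimate(brain_info, idx=0)    # bootstrap value for one agent
if policy.use_curiosity:
    intrinsic = policy.get_intrinsic_rewards(curr_info, next_info)

for _ in range(num_epoch):                              # num_epoch assumed from trainer config
    for mini_batch, num_sequences in minibatches:       # minibatches assumed from the Buffer
        stats = policy.update(mini_batch, num_sequences)
        # stats: value_loss, policy_loss (+ forward_loss / inverse_loss with curiosity)
policy.increment_step()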