GitHub
5 years ago
Current commit 7d954797
8 changed files with 489 additions and 147 deletions
- ml-agents/mlagents/trainers/common/nn_policy.py (159 lines changed)
- ml-agents/mlagents/trainers/ppo/optimizer.py (4 lines changed)
- ml-agents/mlagents/trainers/sac/optimizer.py (4 lines changed)
- ml-agents/mlagents/trainers/tests/test_bcmodule.py (10 lines changed)
- ml-agents/mlagents/trainers/tests/test_simple_rl.py (4 lines changed)
- ml-agents/mlagents/trainers/tf_policy.py (5 lines changed)
- ml-agents/mlagents/trainers/common/distributions.py (310 lines changed)
- ml-agents/mlagents/trainers/tests/test_distributions.py (140 lines changed)
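The two new files, common/distributions.py and tests/test_distributions.py, are shown in full below.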
import abc
from typing import NamedTuple, List, Tuple

import numpy as np

from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils

EPSILON = 1e-6  # Small value to avoid divide by zero


class OutputDistribution(abc.ABC):
    @abc.abstractproperty
    def log_probs(self) -> tf.Tensor:
        """
        Returns a Tensor that, when evaluated, produces the per-action log probabilities
        of this distribution. Its shape should be (batch_size, number of actions),
        matching the Tensor produced by sample.
        """
        pass

    @abc.abstractproperty
    def total_log_probs(self) -> tf.Tensor:
        """
        Returns a Tensor that, when evaluated, produces the total log probability of a
        single sample. Its shape should be (batch_size, 1).
        """
        pass

    @abc.abstractproperty
    def sample(self) -> tf.Tensor:
        """
        Returns a Tensor that, when evaluated, produces a sample of this OutputDistribution.
        """
        pass

    @abc.abstractproperty
    def entropy(self) -> tf.Tensor:
        """
        Returns a Tensor that, when evaluated, produces the entropy of this distribution.
        """
        pass


class DiscreteOutputDistribution(OutputDistribution):
    @abc.abstractproperty
    def sample_onehot(self) -> tf.Tensor:
        """
        Returns a one-hot version of the sampled output.
        """
        pass


class GaussianDistribution(OutputDistribution):
    """
    A Gaussian output distribution for continuous actions.
    """

    class MuSigmaTensors(NamedTuple):
        mu: tf.Tensor
        log_sigma: tf.Tensor
        sigma: tf.Tensor

    def __init__(
        self,
        logits: tf.Tensor,
        act_size: List[int],
        reparameterize: bool = False,
        tanh_squash: bool = False,
        log_sigma_min: float = -20,
        log_sigma_max: float = 2,
    ):
        """
        A Gaussian output distribution for continuous actions.
        :param logits: Hidden layer to use as the input to the Gaussian distribution.
        :param act_size: List containing the number of continuous actions.
        :param reparameterize: Whether or not to use the reparameterization trick. If False,
            gradients are stopped at the sampling operation before the log-probability
            calculation.
        :param tanh_squash: Squash the output using tanh, constraining it between -1 and 1.
            From: Haarnoja et al., https://arxiv.org/abs/1801.01290
        :param log_sigma_min: Minimum log standard deviation to clip to.
        :param log_sigma_max: Maximum log standard deviation to clip to.
        """
        encoded = self._create_mu_log_sigma(
            logits, act_size, log_sigma_min, log_sigma_max
        )
        self._sampled_policy = self._create_sampled_policy(encoded)
        if not reparameterize:
            _sampled_policy_probs = tf.stop_gradient(self._sampled_policy)
        else:
            _sampled_policy_probs = self._sampled_policy
        self._all_probs = self._create_log_probs(_sampled_policy_probs, encoded)
        if tanh_squash:
            self._sampled_policy = tf.tanh(self._sampled_policy)
            self._all_probs = self._do_squash_correction_for_tanh(
                self._all_probs, self._sampled_policy
            )
        self._total_prob = tf.reduce_sum(self._all_probs, axis=1, keepdims=True)
        self._entropy = self._create_entropy(encoded)

    def _create_mu_log_sigma(
        self,
        logits: tf.Tensor,
        act_size: List[int],
        log_sigma_min: float,
        log_sigma_max: float,
    ) -> "GaussianDistribution.MuSigmaTensors":
        mu = tf.layers.dense(
            logits,
            act_size[0],
            activation=None,
            name="mu",
            kernel_initializer=ModelUtils.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        # Policy-dependent log_sigma
        log_sigma = tf.layers.dense(
            logits,
            act_size[0],
            activation=None,
            name="log_std",
            kernel_initializer=ModelUtils.scaled_init(0.01),
        )
        log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max)
        sigma = tf.exp(log_sigma)
        return self.MuSigmaTensors(mu, log_sigma, sigma)

    def _create_sampled_policy(
        self, encoded: "GaussianDistribution.MuSigmaTensors"
    ) -> tf.Tensor:
        epsilon = tf.random_normal(tf.shape(encoded.mu))
        sampled_policy = encoded.mu + encoded.sigma * epsilon
        return sampled_policy
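    # Sampling as mu + sigma * epsilon (with epsilon ~ N(0, I)) is the
    # reparameterization trick: the sample is a deterministic function of the
    # distribution parameters and parameter-free noise, so gradients can flow
    # through it into mu and sigma when reparameterize=True.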

    def _create_log_probs(
        self, sampled_policy: tf.Tensor, encoded: "GaussianDistribution.MuSigmaTensors"
    ) -> tf.Tensor:
        _gauss_pre = -0.5 * (
            ((sampled_policy - encoded.mu) / (encoded.sigma + EPSILON)) ** 2
            + 2 * encoded.log_sigma
            + np.log(2 * np.pi)
        )
        return _gauss_pre
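    # The expression above is the diagonal-Gaussian log density, evaluated per dimension:
    #   log N(x; mu, sigma) = -0.5 * (((x - mu) / sigma)^2 + 2 * log(sigma) + log(2 * pi))
    # EPSILON keeps the division stable if sigma underflows to zero.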

    def _create_entropy(
        self, encoded: "GaussianDistribution.MuSigmaTensors"
    ) -> tf.Tensor:
        single_dim_entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
        )
        # Make entropy the right shape
        return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy

    def _do_squash_correction_for_tanh(self, probs, squashed_policy):
        """
        Adjust probabilities for squashed sample before output.
        """
        probs -= tf.log(1 - squashed_policy ** 2 + EPSILON)
        return probs
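    # Change of variables for the tanh squash: with a = tanh(u),
    #   log p(a) = log p(u) - log|da/du| = log p(u) - log(1 - tanh(u)^2),
    # which is the per-dimension term subtracted above (Haarnoja et al. 2018, Appendix C).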

    @property
    def total_log_probs(self) -> tf.Tensor:
        return self._total_prob

    @property
    def log_probs(self) -> tf.Tensor:
        return self._all_probs

    @property
    def sample(self) -> tf.Tensor:
        return self._sampled_policy

    @property
    def entropy(self) -> tf.Tensor:
        return self._entropy


class MultiCategoricalDistribution(DiscreteOutputDistribution):
    """
    A categorical distribution for multi-branched discrete actions. Also supports action masking.
    """

    def __init__(self, logits: tf.Tensor, act_size: List[int], action_masks: tf.Tensor):
        """
        A categorical distribution for multi-branched discrete actions.
        :param logits: Hidden layer to use as the input to the categorical distributions.
        :param act_size: List containing the number of discrete actions per branch.
        :param action_masks: Tensor representing action masks. Should be of length
            sum(act_size), with 0 for masked actions and 1 for unmasked ones.
        """
        unmasked_log_probs = self._create_policy_branches(logits, act_size)
        self._sampled_policy, self._all_probs, action_index = self._get_masked_actions_probs(
            unmasked_log_probs, act_size, action_masks
        )
        self._sampled_onehot = self._action_onehot(self._sampled_policy, act_size)
        self._entropy = self._create_entropy(
            self._sampled_onehot, self._all_probs, action_index, act_size
        )
        self._total_prob = self._get_log_probs(
            self._sampled_onehot, self._all_probs, action_index, act_size
        )
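        # Construction order: per-branch logits -> masked, re-normalized log
        # probabilities -> sampled action indices -> one-hot encoding, from which
        # the joint log probability and entropy are derived.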

    def _create_policy_branches(
        self, logits: tf.Tensor, act_size: List[int]
    ) -> tf.Tensor:
        policy_branches = []
        for size in act_size:
            policy_branches.append(
                tf.layers.dense(
                    logits,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            )
        unmasked_log_probs = tf.concat(policy_branches, axis=1)
        return unmasked_log_probs

    def _get_masked_actions_probs(
        self,
        unmasked_log_probs: tf.Tensor,
        act_size: List[int],
        action_masks: tf.Tensor,
    ) -> Tuple[tf.Tensor, tf.Tensor, List[int]]:
        output, _, all_log_probs = ModelUtils.create_discrete_action_masking_layer(
            unmasked_log_probs, action_masks, act_size
        )

        action_idx = [0] + list(np.cumsum(act_size))
        return output, all_log_probs, action_idx
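    # action_idx holds the cumulative offsets delimiting each branch inside the
    # concatenated tensor, e.g. act_size [3, 3, 3, 2] -> [0, 3, 6, 9, 11].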

    def _action_onehot(self, sample: tf.Tensor, act_size: List[int]) -> tf.Tensor:
        action_oh = tf.concat(
            [tf.one_hot(sample[:, i], act_size[i]) for i in range(len(act_size))],
            axis=1,
        )
        return action_oh

    def _get_log_probs(
        self,
        sample_onehot: tf.Tensor,
        all_log_probs: tf.Tensor,
        action_idx: List[int],
        act_size: List[int],
    ) -> tf.Tensor:
        log_probs = tf.reduce_sum(
            (
                tf.stack(
                    [
                        -tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=sample_onehot[:, action_idx[i] : action_idx[i + 1]],
                            logits=all_log_probs[:, action_idx[i] : action_idx[i + 1]],
                        )
                        for i in range(len(act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
            keepdims=True,
        )
        return log_probs
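    # With a one-hot label, softmax cross-entropy equals -log(softmax(logits)[chosen]),
    # so its negation is the log probability of the sampled action in each branch;
    # summing over branches gives the joint log probability of the multi-discrete action.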

    def _create_entropy(
        self,
        sample_onehot: tf.Tensor,
        all_log_probs: tf.Tensor,
        action_idx: List[int],
        act_size: List[int],
    ) -> tf.Tensor:
        entropy = tf.reduce_sum(
            (
                tf.stack(
                    [
                        tf.nn.softmax_cross_entropy_with_logits_v2(
                            labels=tf.nn.softmax(
                                all_log_probs[:, action_idx[i] : action_idx[i + 1]]
                            ),
                            logits=all_log_probs[:, action_idx[i] : action_idx[i + 1]],
                        )
                        for i in range(len(act_size))
                    ],
                    axis=1,
                )
            ),
            axis=1,
        )
        return entropy
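    # Cross-entropy of softmax(logits) with itself is the Shannon entropy
    # H(p) = -sum_i p_i * log(p_i); branch entropies are summed since the
    # branches are independent.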

    @property
    def log_probs(self) -> tf.Tensor:
        return self._all_probs

    @property
    def total_log_probs(self) -> tf.Tensor:
        return self._total_prob

    @property
    def sample(self) -> tf.Tensor:
        return self._sampled_policy

    @property
    def sample_onehot(self) -> tf.Tensor:
        return self._sampled_onehot

    @property
    def entropy(self) -> tf.Tensor:
        return self._entropy
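A minimal usage sketch (not part of this commit): wiring GaussianDistribution to a hypothetical two-unit hidden layer and evaluating its tensors in a TF1 session. The observation size, hidden width, and feed values are illustrative assumptions.

import numpy as np
from mlagents.tf_utils import tf
from mlagents.trainers.common.distributions import GaussianDistribution

with tf.Graph().as_default():
    # Hypothetical encoder: a 2-unit hidden layer over an 8-dim observation.
    obs = tf.placeholder(shape=[None, 8], dtype=tf.float32, name="obs")
    hidden = tf.layers.dense(obs, 2, activation=tf.nn.tanh, name="hidden")
    dist = GaussianDistribution(
        hidden, act_size=[2], reparameterize=False, tanh_squash=True
    )
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        actions, log_probs, entropy = sess.run(
            [dist.sample, dist.log_probs, dist.entropy],
            feed_dict={obs: np.zeros((3, 8), dtype=np.float32)},
        )
        # actions: (3, 2), squashed into [-1, 1]; log_probs: (3, 2); entropy: (3,)

The new test file, tests/test_distributions.py, follows.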
import pytest
import yaml

from mlagents.tf_utils import tf
from mlagents.trainers.common.distributions import (
    GaussianDistribution,
    MultiCategoricalDistribution,
)


@pytest.fixture
def dummy_config():
    return yaml.safe_load(
        """
        trainer: ppo
        batch_size: 32
        beta: 5.0e-3
        buffer_size: 512
        epsilon: 0.2
        hidden_units: 128
        lambd: 0.95
        learning_rate: 3.0e-4
        max_steps: 5.0e4
        normalize: true
        num_epoch: 5
        num_layers: 2
        time_horizon: 64
        sequence_length: 64
        summary_freq: 1000
        use_recurrent: false
        memory_size: 8
        curiosity_strength: 0.0
        curiosity_enc_size: 1
        summary_path: test
        model_path: test
        reward_signals:
            extrinsic:
                strength: 1.0
                gamma: 0.99
        """
    )


VECTOR_ACTION_SPACE = [2]
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 32
NUM_AGENTS = 12


def test_gaussian_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
        distribution = GaussianDistribution(
            logits,
            act_size=VECTOR_ACTION_SPACE,
            reparameterize=False,
            tanh_squash=False,
        )
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run([distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
            output = sess.run([distribution.total_log_probs])
            assert output[0].shape[0] == 1


def test_tanh_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
        distribution = GaussianDistribution(
            logits, act_size=VECTOR_ACTION_SPACE, reparameterize=False, tanh_squash=True
        )
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run([distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                # Assert action never exceeds [-1, 1]
                action = output[0][0]
                for act in action:
                    assert act >= -1 and act <= 1
            output = sess.run([distribution.total_log_probs])
            assert output[0].shape[0] == 1


def test_multicategorical_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
        action_masks = tf.Variable(
            initial_value=[[1 for _ in range(sum(DISCRETE_ACTION_SPACE))]],
            trainable=True,
            dtype=tf.float32,
        )
        distribution = MultiCategoricalDistribution(
            logits, act_size=DISCRETE_ACTION_SPACE, action_masks=action_masks
        )
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                sample, log_probs = sess.run(
                    [distribution.sample, distribution.log_probs]
                )
                assert len(log_probs[0]) == sum(DISCRETE_ACTION_SPACE)
                # Assert each sampled action is a valid index for its branch
                assert len(sample[0]) == len(DISCRETE_ACTION_SPACE)
                for i, act in enumerate(sample[0]):
                    assert act >= 0 and act < DISCRETE_ACTION_SPACE[i]
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1

            # Test masks: leave only the first action of each branch unmasked
            mask = []
            for space in DISCRETE_ACTION_SPACE:
                mask.append(1)
                for _action_space in range(1, space):
                    mask.append(0)
            for _ in range(10):
                sample, log_probs = sess.run(
                    [distribution.sample, distribution.log_probs],
                    feed_dict={action_masks: [mask]},
                )
                # Only action 0 is unmasked in each branch, so it must be sampled
                for act in sample[0]:
                    assert act == 0
                output = sess.run([distribution.total_log_probs])
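
As a quick numerical sanity check (not part of this commit), the closed-form density used in _create_log_probs can be cross-checked against scipy's reference implementation; the sample values and tolerance are arbitrary assumptions.

import numpy as np
from scipy.stats import norm

EPSILON = 1e-6  # same constant as distributions.py

mu, sigma = 0.3, 0.8
x = np.array([-0.5, 0.0, 1.2])
# Same expression as _gauss_pre in GaussianDistribution._create_log_probs
ours = -0.5 * (
    ((x - mu) / (sigma + EPSILON)) ** 2 + 2 * np.log(sigma) + np.log(2 * np.pi)
)
assert np.allclose(ours, norm.logpdf(x, loc=mu, scale=sigma), atol=1e-5)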