Compare commits

...
This merge request contains changes that conflict with the target branch.
/setup.cfg
/.pre-commit-config.yaml
/ml-agents/setup.py
/ml-agents/mlagents/trainers/trainer_controller.py
/ml-agents/mlagents/trainers/ppo/trainer.py
/ml-agents/mlagents/trainers/tests/test_trainer_controller.py
/ml-agents/mlagents/trainers/trainer.py
/.circleci/config.yml
/test_constraints_max_tf2_version.txt
/ml-agents/mlagents/trainers/models.py
/ml-agents/mlagents/trainers/tensorflow_to_barracuda.py
/ml-agents/mlagents/trainers/bc/models.py
/ml-agents/mlagents/trainers/components/bc/model.py
/ml-agents/mlagents/trainers/components/reward_signals/__init__.py
/ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py
/ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
/ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
/ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
/ml-agents/mlagents/trainers/tf_policy.py
/ml-agents/mlagents/trainers/sac/models.py
/ml-agents/mlagents/trainers/sac/policy.py
/ml-agents/mlagents/trainers/tests/test_bc.py
/ml-agents/mlagents/trainers/tests/test_multigpu.py
/ml-agents/mlagents/trainers/tests/test_ppo.py
/ml-agents/mlagents/trainers/tests/test_sac.py
/ml-agents/mlagents/trainers/ppo/models.py
/ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py
/ml-agents/mlagents/trainers/ppo/policy.py

10 commits

Author  SHA1  Message  Commit date
Ervin Teng  437c6c2f  Add dummy save methods  5 years ago
Ervin Teng  ed2c35b9  Remove some comments  5 years ago
Ervin Teng  3eb1e9c2  Pytorch port of continuous PPO  5 years ago
Ervin Teng  a665daed  It's mostly training  5 years ago
Ervin Teng  5e1c1a00  Tweaks to Policy  5 years ago
Ervin Teng  5e6de46f  Add normalizer  5 years ago
Ervin Teng  9dbbfd77  Somewhat running  5 years ago
Ervin Teng  748c250e  Somewhat running  5 years ago
Ervin Teng  987e0e3a  Merge tf2 branch  5 years ago
Ervin Teng  e185844f  Start on TF 2 policy  5 years ago
34 files changed, with 723 insertions and 230 deletions
  1. 8    .circleci/config.yml
  2. 3    .pre-commit-config.yaml
  3. 2    setup.cfg
  4. 2    test_constraints_max_tf2_version.txt
  5. 2    ml-agents/setup.py
  6. 17   ml-agents/mlagents/trainers/models.py
  7. 2    ml-agents/mlagents/trainers/tensorflow_to_barracuda.py
  8. 10   ml-agents/mlagents/trainers/bc/models.py
  9. 19   ml-agents/mlagents/trainers/trainer.py
  10. 2   ml-agents/mlagents/trainers/trainer_controller.py
  11. 3   ml-agents/mlagents/trainers/components/bc/model.py
  12. 2   ml-agents/mlagents/trainers/components/reward_signals/__init__.py
  13. 3   ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py
  14. 3   ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
  15. 3   ml-agents/mlagents/trainers/components/reward_signals/gail/model.py
  16. 2   ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
  17. 2   ml-agents/mlagents/trainers/tf_policy.py
  18. 8   ml-agents/mlagents/trainers/sac/models.py
  19. 2   ml-agents/mlagents/trainers/sac/policy.py
  20. 2   ml-agents/mlagents/trainers/tests/test_bc.py
  21. 2   ml-agents/mlagents/trainers/tests/test_multigpu.py
  22. 3   ml-agents/mlagents/trainers/tests/test_ppo.py
  23. 3   ml-agents/mlagents/trainers/tests/test_sac.py
  24. 8   ml-agents/mlagents/trainers/tests/test_trainer_controller.py
  25. 3   ml-agents/mlagents/trainers/ppo/models.py
  26. 3   ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py
  27. 26  ml-agents/mlagents/trainers/ppo/trainer.py
  28. 501 ml-agents/mlagents/trainers/ppo/policy.py
  29. 6   test_constraints_max_tf1_version.txt
  30. 273 ml-agents/mlagents/trainers/ppo/policy_old.py
  31. 2   ml-agents/mlagents/tf_utils/__init__.py
  32. 26  ml-agents/mlagents/tf_utils/tf.py
  33. 0   /test_constraints_max_tf2_version.txt

8
.circleci/config.yml


executor: python373
pyversion: 3.7.3
# Test python 3.7 with the newest supported versions
pip_constraints: test_constraints_max_version.txt
pip_constraints: test_constraints_max_tf1_version.txt
- build_python:
name: python_3.7.3+tf2
executor: python373
pyversion: 3.7.3
# Test python 3.7 with the newest supported versions
pip_constraints: test_constraints_max_tf2_version.txt
- markdown_link_check
- protobuf_generation_check
- deploy:

3
.pre-commit-config.yaml


.*_pb2.py|
.*_pb2_grpc.py
)$
additional_dependencies: [flake8-comprehensions]
# flake8-tidy-imports is used for banned-modules, not actually tidying
additional_dependencies: [flake8-comprehensions, flake8-tidy-imports]
- id: trailing-whitespace
name: trailing-whitespace-markdown
types: [markdown]

2
setup.cfg


# Black tends to introduce things flake8 doesn't like, such as "line break before binary operator"
# or whitespace before ':'. Rather than fight with black, just ignore these for now.
W503, E203,
# flake-tidy-import adds this warning, which we don't really care about for now
I200

2
test_constraints_max_tf2_version.txt


# For projects with upper bounds, we should periodically update this list to the latest release version
grpcio>=1.23.0
numpy>=1.17.2
tensorflow>=1.14.0,<2.0
tensorflow>=2.0.0,<2.1.0

2
ml-agents/setup.py


"Pillow>=4.2.1",
"protobuf>=3.6",
"pyyaml",
"tensorflow>=1.7,<2.0",
"tensorflow>=2.0",
'pypiwin32==223;platform_system=="Windows"',
],
python_requires=">=3.6.1",

17
ml-agents/mlagents/trainers/models.py


from typing import Callable, Dict, List, Optional
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as c_layers
from mlagents.tf_utils import tf, tf_variance_scaling, tf_rnn, tf_flatten
from mlagents.trainers.trainer import UnityTrainerException
from mlagents.envs.brain import CameraResolution

@staticmethod
def scaled_init(scale):
return c_layers.variance_scaling_initializer(scale)
return tf_variance_scaling(scale)
@staticmethod
def swish(input_activation: tf.Tensor) -> tf.Tensor:

activation=activation,
reuse=reuse,
name="hidden_{}".format(i),
kernel_initializer=c_layers.variance_scaling_initializer(1.0),
kernel_initializer=tf_variance_scaling(1.0),
)
return hidden

reuse=reuse,
name="conv_2",
)
hidden = c_layers.flatten(conv2)
hidden = tf_flatten(conv2)
with tf.variable_scope(scope + "/" + "flat_encoding"):
hidden_flat = LearningModel.create_vector_observation_encoder(

reuse=reuse,
name="conv_3",
)
hidden = c_layers.flatten(conv3)
hidden = tf_flatten(conv3)
with tf.variable_scope(scope + "/" + "flat_encoding"):
hidden_flat = LearningModel.create_vector_observation_encoder(

)
hidden = tf.add(block_input, hidden)
hidden = tf.nn.relu(hidden)
hidden = c_layers.flatten(hidden)
hidden = tf_flatten(hidden)
with tf.variable_scope(scope + "/" + "flat_encoding"):
hidden_flat = LearningModel.create_vector_observation_encoder(

memory_in = tf.reshape(memory_in[:, :], [-1, m_size])
half_point = int(m_size / 2)
with tf.variable_scope(name):
rnn_cell = tf.contrib.rnn.BasicLSTMCell(half_point)
lstm_vector_in = tf.contrib.rnn.LSTMStateTuple(
rnn_cell = tf_rnn.BasicLSTMCell(half_point)
lstm_vector_in = tf_rnn.LSTMStateTuple(
memory_in[:, :half_point], memory_in[:, half_point:]
)
recurrent_output, lstm_state_out = tf.nn.dynamic_rnn(

2
ml-agents/mlagents/trainers/tensorflow_to_barracuda.py


from __future__ import print_function
import numpy as np
import struct # convert from Python values and C structs
import tensorflow as tf
from mlagents.tf_utils import tf
import re
# import barracuda

10
ml-agents/mlagents/trainers/bc/models.py


import tensorflow as tf
import tensorflow.contrib.layers as c_layers
from mlagents.tf_utils import tf, tf_variance_scaling
from mlagents.trainers.models import LearningModel

size,
activation=None,
use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(
factor=0.01
),
kernel_initializer=tf_variance_scaling(0.01),
)
)
self.action_probs = tf.concat(

activation=None,
use_bias=False,
name="pre_action",
kernel_initializer=c_layers.variance_scaling_initializer(factor=0.01),
kernel_initializer=tf_variance_scaling(0.01),
)
self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
self.sample_action = tf.identity(self.clipped_sample_action, name="action")

19
ml-agents/mlagents/trainers/trainer.py


from typing import Dict, List, Deque, Any
import os
import tensorflow as tf
import numpy as np
from collections import deque, defaultdict

self.trainer_metrics = TrainerMetrics(
path=self.summary_path + ".csv", brain_name=self.brain_name
)
self.summary_writer = tf.summary.FileWriter(self.summary_path)
self.summary_writer = tf.summary.create_file_writer(self.summary_path)
self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
self.policy: TFPolicy = None
self.step: int = 0

self.run_id, self.brain_name, step, is_training
)
)
summary = tf.Summary()
for key in self.stats:
if len(self.stats[key]) > 0:
stat_mean = float(np.mean(self.stats[key]))
summary.value.add(tag="{}".format(key), simple_value=stat_mean)
self.stats[key] = []
summary.value.add(tag="Environment/Lesson", simple_value=lesson_num)
self.summary_writer.add_summary(summary, step)
self.summary_writer.flush()
with self.summary_writer.as_default():
for key in self.stats:
if len(self.stats[key]) > 0:
stat_mean = float(np.mean(self.stats[key]))
tf.summary.scalar("{}".format(key), stat_mean, step=step)
self.stats[key] = []
tf.summary.scalar("Environment/Lesson", lesson_num, step)
def write_tensorboard_text(self, key: str, input_dict: Dict[str, Any]) -> None:
"""

2
ml-agents/mlagents/trainers/trainer_controller.py


from typing import Dict, List, Optional, Set
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
from time import time
from mlagents.envs.env_manager import EnvironmentStep

3
ml-agents/mlagents/trainers/components/bc/model.py


import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.trainers.models import LearningModel

2
ml-agents/mlagents/trainers/components/reward_signals/__init__.py


import numpy as np
import abc
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.envs.brain import BrainInfo
from mlagents.trainers.trainer import UnityTrainerException

3
ml-agents/mlagents/trainers/components/reward_signals/curiosity/model.py


from typing import List, Tuple
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.trainers.models import LearningModel

3
ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py


from typing import Any, Dict, List
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.envs.brain import BrainInfo
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult

3
ml-agents/mlagents/trainers/components/reward_signals/gail/model.py


from typing import List, Optional, Tuple
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.trainers.models import LearningModel
EPSILON = 1e-7

2
ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py


from typing import Any, Dict, List
import logging
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.envs.brain import BrainInfo
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult

2
ml-agents/mlagents/trainers/tf_policy.py


from typing import Any, Dict, List, Optional
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.envs.exception import UnityException
from mlagents.envs.policy import Policy

8
ml-agents/mlagents/trainers/sac/models.py


import numpy as np
from typing import Dict, List, Optional
import tensorflow as tf
from mlagents.tf_utils import tf, tf_variance_scaling
import tensorflow.contrib.layers as c_layers
LOG_STD_MAX = 2
LOG_STD_MIN = -20

size,
activation=None,
use_bias=False,
kernel_initializer=c_layers.variance_scaling_initializer(
factor=0.01
),
kernel_initializer=tf_variance_scaling(0.01),
)
)
all_logits = tf.concat(

2
ml-agents/mlagents/trainers/sac/policy.py


import logging
from typing import Dict, Any, Optional
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.envs.timers import timed
from mlagents.envs.brain import BrainInfo, BrainParameters

2
ml-agents/mlagents/trainers/tests/test_bc.py


import os
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
import yaml
from mlagents.trainers.bc.models import BehavioralCloningModel

2
ml-agents/mlagents/trainers/tests/test_multigpu.py


import unittest.mock as mock
import pytest
import tensorflow as tf
from mlagents.tf_utils import tf
import yaml
from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy

3
ml-agents/mlagents/trainers/tests/test_ppo.py


import pytest
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
import yaml
from mlagents.trainers.ppo.models import PPOModel

3
ml-agents/mlagents/trainers/tests/test_sac.py


import yaml
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.trainers.sac.models import SACModel
from mlagents.trainers.sac.policy import SACPolicy

8
ml-agents/mlagents/trainers/tests/test_trainer_controller.py


from unittest.mock import MagicMock, Mock, patch
from mlagents.tf_utils import tf
import yaml
import pytest

@patch("numpy.random.seed")
@patch("tensorflow.set_random_seed")
@patch.object(tf, "set_random_seed")
def test_initialization_seed(numpy_random_seed, tensorflow_set_seed):
seed = 27
TrainerController(

return tc, trainer_mock
@patch("tensorflow.reset_default_graph")
@patch.object(tf, "reset_default_graph")
def test_start_learning_trains_forever_if_no_train_model(tf_reset_graph):
tc, trainer_mock = trainer_controller_with_start_learning_mocks()
tc.train_model = False

env_mock.close.assert_called_once()
@patch("tensorflow.reset_default_graph")
@patch.object(tf, "reset_default_graph")
def test_start_learning_trains_until_max_steps_then_saves(tf_reset_graph):
tc, trainer_mock = trainer_controller_with_start_learning_mocks()
tf_reset_graph.return_value = None
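
The switch from @patch("tensorflow.set_random_seed") and @patch("tensorflow.reset_default_graph") to @patch.object(tf, ...) matters because the code under test now reaches TensorFlow through the mlagents.tf_utils re-export rather than by importing tensorflow directly. A rough sketch of the pattern, assuming the re-exported tf still exposes reset_default_graph (it does via compat.v1):

from unittest.mock import patch
from mlagents.tf_utils import tf

@patch.object(tf, "reset_default_graph")
def test_graph_is_reset(mock_reset):
    tf.reset_default_graph()      # stand-in for whatever the code under test calls
    mock_reset.assert_called_once()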

3
ml-agents/mlagents/trainers/ppo/models.py


from typing import Optional
import numpy as np
import tensorflow as tf
from mlagents.tf_utils import tf
from mlagents.trainers.models import LearningModel, EncoderType, LearningRateSchedule
logger = logging.getLogger("mlagents.trainers")

3
ml-agents/mlagents/trainers/ppo/multi_gpu_policy.py


import logging
from typing import Any, Dict, List, Optional
import tensorflow as tf
from mlagents.tf_utils import tf
from tensorflow.python.client import device_lib
from mlagents.envs.brain import BrainParameters
from mlagents.envs.timers import timed

26
ml-agents/mlagents/trainers/ppo/trainer.py


:param next_info: Dictionary of all next brains and corresponding BrainInfo.
"""
info = next_info[self.brain_name]
if self.is_training:
self.policy.update_normalization(info.vector_observations)
# if self.is_training:
# self.policy.update_normalization(info.vector_observations)
for l in range(len(info.agents)):
agent_actions = self.training_buffer[info.agents[l]]["actions"]
if (

Takes the output of the last action and store it into the training buffer.
"""
actions = take_action_outputs["action"]
if self.policy.use_continuous_act:
actions_pre = take_action_outputs["pre_action"]
self.training_buffer[agent_id]["actions_pre"].append(actions_pre[agent_idx])
epsilons = take_action_outputs["random_normal_epsilon"]
self.training_buffer[agent_id]["random_normal_epsilon"].append(
epsilons[agent_idx]
)
# if self.policy.use_continuous_act:
# actions_pre = take_action_outputs["pre_action"]
# self.training_buffer[agent_id]["actions_pre"].append(actions_pre[agent_idx])
# epsilons = take_action_outputs["random_normal_epsilon"]
# self.training_buffer[agent_id]["random_normal_epsilon"].append(
# epsilons[agent_idx]
# )
a_dist = take_action_outputs["log_probs"]
# value is a dictionary from name of reward to value estimate of the value head
self.training_buffer[agent_id]["actions"].append(actions[agent_idx])

for stat, stat_list in batch_update_stats.items():
self.stats[stat].append(np.mean(stat_list))
if self.policy.bc_module:
update_stats = self.policy.bc_module.update()
for stat, val in update_stats.items():
self.stats[stat].append(val)
# if self.policy.bc_module:
# update_stats = self.policy.bc_module.update()
# for stat, val in update_stats.items():
# self.stats[stat].append(val)
self.clear_update_buffer()
self.trainer_metrics.end_policy_update()

501
ml-agents/mlagents/trainers/ppo/policy.py


import logging
from typing import Any, Dict, Optional
from typing import Any, Dict # , Optional
# import tensorflow_probability as tfp
import torch
import torch.nn as nn
from mlagents.trainers.models import EncoderType, LearningRateSchedule
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.envs.action_info import ActionInfo
from mlagents.trainers.models import EncoderType # , LearningRateSchedule
from mlagents.trainers.components.bc.module import BCModule
class PPOPolicy(TFPolicy):
class VectorEncoder(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, **kwargs):
super(VectorEncoder, self).__init__(**kwargs)
self.layers = [nn.Linear(input_size, hidden_size)]
for i in range(num_layers - 1):
self.layers.append(nn.Linear(hidden_size, hidden_size))
self.layers.append(nn.ReLU())
print(self.layers)
def forward(self, inputs):
x = inputs
for layer in self.layers:
x = layer(x)
return x
class Critic(nn.Module):
def __init__(self, stream_names, hidden_size, encoder, **kwargs):
super(Critic, self).__init__(**kwargs)
self.stream_names = stream_names
self.encoder = encoder
self.value_heads = {}
for name in stream_names:
value = nn.Linear(hidden_size, 1)
self.value_heads[name] = value
def forward(self, inputs):
hidden = self.encoder(inputs)
value_outputs = {}
for stream_name, value in self.value_heads.items():
value_outputs[stream_name] = self.value_heads[stream_name](hidden)
return value_outputs
class GaussianDistribution(nn.Module):
def __init__(self, hidden_size, num_outputs, **kwargs):
super(GaussianDistribution, self).__init__(**kwargs)
self.mu = nn.Linear(hidden_size, num_outputs)
self.log_sigma_sq = nn.Linear(hidden_size, num_outputs)
nn.init.xavier_uniform(self.mu.weight, gain=0.01)
nn.init.xavier_uniform(self.log_sigma_sq.weight, gain=0.01)
def forward(self, inputs):
mu = self.mu(inputs)
log_sig = self.log_sigma_sq(inputs)
return torch.distributions.normal.Normal(
loc=mu, scale=torch.sqrt(torch.exp(log_sig))
)
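# Illustrative usage of the head above, not part of the change; sizes are made
# up. The forward pass returns a torch Normal with scale = sqrt(exp(log_sigma_sq)).
_head = GaussianDistribution(32, 2)
_dist = _head(torch.randn(5, 32))       # torch.distributions.Normal
_action = _dist.sample()                # shape (5, 2)
_log_prob = _dist.log_prob(_action)     # per-dimension log-probabilities
_entropy = _dist.entropy()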
class Normalizer(nn.Module):
def __init__(self, vec_obs_size, **kwargs):
super(Normalizer, self).__init__(**kwargs)
print(vec_obs_size)
self.normalization_steps = torch.tensor(1)
self.running_mean = torch.zeros(vec_obs_size)
self.running_variance = torch.ones(vec_obs_size)
def forward(self, inputs):
inputs = torch.from_numpy(inputs)
normalized_state = torch.clamp(
(inputs - self.running_mean)
/ torch.sqrt(
self.running_variance / self.normalization_steps.type(torch.float32)
),
-5,
5,
)
return normalized_state
def update(self, vector_input):
vector_input = torch.from_numpy(vector_input)
mean_current_observation = vector_input.mean(0).type(torch.float32)
new_mean = self.running_mean + (
mean_current_observation - self.running_mean
) / (self.normalization_steps + 1).type(torch.float32)
new_variance = self.running_variance + (mean_current_observation - new_mean) * (
mean_current_observation - self.running_mean
)
self.running_mean = new_mean
self.running_variance = new_variance
self.normalization_steps = self.normalization_steps + 1
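# Illustrative usage of the running normalizer above, not part of the change;
# the policy calls update() during training and forward() before inference.
# Assumes numpy is imported as np, as elsewhere in this file.
_norm = Normalizer(3)
_obs = np.random.rand(8, 3).astype(np.float32)   # fake batch of vector observations
_norm.update(_obs)                               # advance running mean / variance
_normalized = _norm(_obs)                        # result is clamped to [-5, 5]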
class ActorCriticPolicy(nn.Module):
def __init__(
self,
h_size,
input_size,
act_size,
normalize,
num_layers,
m_size,
stream_names,
vis_encode_type,
):
super(ActorCriticPolicy, self).__init__()
self.encoder = VectorEncoder(input_size, h_size, num_layers)
self.distribution = GaussianDistribution(h_size, act_size)
self.critic = Critic(
stream_names, h_size, VectorEncoder(input_size, h_size, num_layers)
)
self.act_size = act_size
self.normalize = normalize
self.normalizer = Normalizer(input_size)
def forward(self, inputs):
if self.normalize:
inputs = self.normalizer(inputs)
_hidden = self.encoder(inputs)
# epsilon = np.random.normal(size=(input.shape[0], self.act_size))
dist = self.distribution(_hidden)
# raw_action = dist.sample()
# action = tf.clip_by_value(raw_action, -3, 3) / 3
# log_prob = dist.log_prob(raw_action)
# entropy = dist.entropy()
return dist
def update_normalization(self, inputs):
if self.normalize:
self.normalizer.update(inputs)
def get_values(self, inputs):
if self.normalize:
inputs = self.normalizer(inputs)
return self.critic(inputs)
class PPOPolicy(object):
def __init__(
self,
seed: int,

:param is_training: Whether the model should be trained.
:param load: Whether a pre-trained model will be loaded or a new one created.
"""
super().__init__(seed, brain, trainer_params)
reward_signal_configs = trainer_params["reward_signals"]
# super().__init__(seed, brain, trainer_params)
# TF defaults to 32-bit, so we use the same here.
torch.set_default_tensor_type(torch.DoubleTensor)
reward_signal_configs = trainer_params["reward_signals"]
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",

brain, trainer_params, reward_signal_configs, is_training, load, seed
)
self.brain = brain
self.trainer_params = trainer_params
self.optimizer = torch.optim.Adam(
self.model.parameters(), lr=self.trainer_params["learning_rate"]
)
self.sequence_length = (
1
if not self.trainer_params["use_recurrent"]
else self.trainer_params["sequence_length"]
)
self.global_step = torch.tensor(0)
with self.graph.as_default():
self.bc_module: Optional[BCModule] = None
# Create pretrainer if needed
if "pretraining" in trainer_params:
BCModule.check_config(trainer_params["pretraining"])
self.bc_module = BCModule(
self,
policy_learning_rate=trainer_params["learning_rate"],
default_batch_size=trainer_params["batch_size"],
default_num_epoch=trainer_params["num_epoch"],
**trainer_params["pretraining"],
)
if load:
self._load_graph()
else:
self._initialize_graph()
def create_model(
self, brain, trainer_params, reward_signal_configs, is_training, load, seed
):

:param reward_signal_configs: Reward signal config
:param seed: Random seed.
"""
with self.graph.as_default():
self.model = PPOModel(
brain=brain,
lr=float(trainer_params["learning_rate"]),
lr_schedule=LearningRateSchedule(
trainer_params.get("learning_rate_schedule", "linear")
),
h_size=int(trainer_params["hidden_units"]),
epsilon=float(trainer_params["epsilon"]),
beta=float(trainer_params["beta"]),
max_step=float(trainer_params["max_steps"]),
normalize=trainer_params["normalize"],
use_recurrent=trainer_params["use_recurrent"],
num_layers=int(trainer_params["num_layers"]),
m_size=self.m_size,
seed=seed,
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
self.model = ActorCriticPolicy(
h_size=int(trainer_params["hidden_units"]),
input_size=brain.vector_observation_space_size,
act_size=sum(brain.vector_action_space_size),
normalize=trainer_params["normalize"],
num_layers=int(trainer_params["num_layers"]),
m_size=trainer_params["memory_size"],
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
def ppo_value_loss(self, values, old_values, returns):
"""
Creates training-specific Tensorflow ops for PPO models.
:param probs: Current policy probabilities
:param old_probs: Past policy probabilities
:param value_heads: Value estimate tensors from each value stream
:param beta: Entropy regularization strength
:param entropy: Current policy entropy
:param epsilon: Value for policy-divergence threshold
:param lr: Learning rate
:param max_step: Total number of training steps.
"""
decay_epsilon = self.trainer_params["epsilon"]
value_losses = []
for name, head in values.items():
old_val_tensor = torch.DoubleTensor(old_values[name])
clipped_value_estimate = old_val_tensor + torch.clamp(
torch.sum(head, dim=1) - old_val_tensor, -decay_epsilon, decay_epsilon
self.model.create_ppo_optimizer()
v_opt_a = (torch.DoubleTensor(returns[name]) - torch.sum(head, dim=1)) ** 2
v_opt_b = (torch.DoubleTensor(returns[name]) - clipped_value_estimate) ** 2
value_loss = torch.mean(torch.max(v_opt_a, v_opt_b))
value_losses.append(value_loss)
value_loss = torch.mean(torch.stack(value_losses))
return value_loss
def ppo_policy_loss(self, advantages, probs, old_probs, masks, epsilon):
"""
Creates training-specific Tensorflow ops for PPO models.
:param probs: Current policy probabilities
:param old_probs: Past policy probabilities
:param value_heads: Value estimate tensors from each value stream
:param beta: Entropy regularization strength
:param entropy: Current policy entropy
:param epsilon: Value for policy-divergence threshold
:param lr: Learning rate
:param max_step: Total number of training steps.
"""
advantage = torch.from_numpy(np.expand_dims(advantages, -1))
self.inference_dict.update(
{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value_heads": self.model.value_heads,
"value": self.model.value,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out
decay_epsilon = self.trainer_params["epsilon"]
self.total_policy_loss = self.model.abs_policy_loss
self.update_dict.update(
{
"value_loss": self.model.value_loss,
"policy_loss": self.total_policy_loss,
"update_batch": self.model.update_batch,
}
r_theta = torch.exp(probs - torch.DoubleTensor(old_probs))
p_opt_a = r_theta * advantage
p_opt_b = (
torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
# print(tf.reduce_mean(p_opt_a), tf.reduce_mean(p_opt_b))
policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b))
# For cleaner stats reporting
# abs_policy_loss = tf.abs(policy_loss)
return policy_loss
def create_reward_signals(self, reward_signal_configs):
"""

self.reward_signals = {}
with self.graph.as_default():
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.model, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
# with self.graph.as_default():
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.model, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
def execute_model(self, observations):
action_dist = self.model(observations)
action = action_dist.sample()
log_probs = action_dist.log_prob(action)
entropy = action_dist.entropy()
value_heads = self.model.get_values(observations)
return action, log_probs, entropy, value_heads
@timed
def evaluate(self, brain_info):

:return: Outputs from network as defined by self.inference_dict.
"""
feed_dict = {
self.model.batch_size: len(brain_info.vector_observations),
self.model.sequence_length: 1,
run_out = {}
action, log_probs, entropy, value_heads = self.execute_model(
brain_info.vector_observations
)
run_out["action"] = np.array(action.detach())
run_out["log_probs"] = np.array(log_probs.detach())
run_out["entropy"] = np.array(entropy.detach())
run_out["value_heads"] = {
name: np.array(t.detach()) for name, t in value_heads.items()
epsilon = None
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[
self.model.prev_action
] = brain_info.previous_vector_actions.reshape(
[-1, len(self.model.act_size)]
)
feed_dict[self.model.memory_in] = self.retrieve_memories(brain_info.agents)
if self.use_continuous_act:
epsilon = np.random.normal(
size=(len(brain_info.vector_observations), self.model.act_size[0])
)
feed_dict[self.model.epsilon] = epsilon
feed_dict = self.fill_eval_dict(feed_dict, brain_info)
run_out = self._execute_model(feed_dict, self.inference_dict)
if self.use_continuous_act:
run_out["random_normal_epsilon"] = epsilon
run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
run_out["learning_rate"] = 0.0
self.model.update_normalization(brain_info.vector_observations)
def get_action(self, brain_info: BrainInfo) -> ActionInfo:
"""
Decides actions given observations information, and takes them in environment.
:param brain_info: A dictionary of brain names and BrainInfo from environment.
:return: an ActionInfo containing action, memories, values and an object
to be passed to add experiences
"""
if len(brain_info.agents) == 0:
return ActionInfo([], [], None)
run_out = self.evaluate(brain_info) # pylint: disable=assignment-from-no-return
return ActionInfo(
action=run_out.get("action"), value=run_out.get("value"), outputs=run_out
)
@timed
def update(self, mini_batch, num_sequences):
"""

:return: Results of update.
"""
feed_dict = self.construct_feed_dict(self.model, mini_batch, num_sequences)
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():
feed_dict.update(
reward_signal.prepare_update(self.model, mini_batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
returns = {}
old_values = {}
for name in self.reward_signals:
returns[name] = mini_batch["{}_returns".format(name)]
old_values[name] = mini_batch["{}_value_estimates".format(name)]
update_vals = self._execute_model(feed_dict, self.update_dict)
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
return update_stats
obs = np.array(mini_batch["vector_obs"])
values = self.model.get_values(obs)
dist = self.model(obs)
probs = dist.log_prob(torch.from_numpy(np.array(mini_batch["actions"])))
entropy = dist.entropy()
value_loss = self.ppo_value_loss(values, old_values, returns)
policy_loss = self.ppo_policy_loss(
np.array(mini_batch["advantages"]),
probs,
np.array(mini_batch["action_probs"]),
np.array(mini_batch["masks"], dtype=np.uint32),
1e-3,
)
loss = (
policy_loss
+ 0.5 * value_loss
- self.trainer_params["beta"] * torch.mean(entropy)
)
self.optimizer.zero_grad()
loss.backward()
def construct_feed_dict(self, model, mini_batch, num_sequences):
feed_dict = {
model.batch_size: num_sequences,
model.sequence_length: self.sequence_length,
model.mask_input: mini_batch["masks"],
model.advantage: mini_batch["advantages"],
model.all_old_log_probs: mini_batch["action_probs"],
}
for name in self.reward_signals:
feed_dict[model.returns_holders[name]] = mini_batch[
"{}_returns".format(name)
]
feed_dict[model.old_values[name]] = mini_batch[
"{}_value_estimates".format(name)
]
self.optimizer.step()
update_stats = {}
update_stats["Losses/Policy Loss"] = abs(policy_loss.detach().numpy())
update_stats["Losses/Value Loss"] = value_loss.detach().numpy()
if self.use_continuous_act:
feed_dict[model.output_pre] = mini_batch["actions_pre"]
feed_dict[model.epsilon] = mini_batch["random_normal_epsilon"]
else:
feed_dict[model.action_holder] = mini_batch["actions"]
if self.use_recurrent:
feed_dict[model.prev_action] = mini_batch["prev_action"]
feed_dict[model.action_masks] = mini_batch["action_mask"]
if self.use_vec_obs:
feed_dict[model.vector_in] = mini_batch["vector_obs"]
if self.model.vis_obs_size > 0:
for i, _ in enumerate(self.model.visual_in):
feed_dict[model.visual_in[i]] = mini_batch["visual_obs%d" % i]
if self.use_recurrent:
mem_in = [
mini_batch["memory"][i]
for i in range(0, len(mini_batch["memory"]), self.sequence_length)
]
feed_dict[model.memory_in] = mem_in
return feed_dict
return update_stats
def get_value_estimates(
self, brain_info: BrainInfo, idx: int, done: bool

:return: The value estimate dictionary with key being the name of the reward signal and the value the
corresponding value estimate.
"""
feed_dict: Dict[tf.Tensor, Any] = {
self.model.batch_size: 1,
self.model.sequence_length: 1,
}
for i in range(len(brain_info.visual_observations)):
feed_dict[self.model.visual_in[i]] = [
brain_info.visual_observations[i][idx]
]
if self.use_vec_obs:
feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
if self.use_recurrent:
feed_dict[self.model.memory_in] = self.retrieve_memories([idx])
if not self.use_continuous_act and self.use_recurrent:
feed_dict[self.model.prev_action] = [
brain_info.previous_vector_actions[idx]
]
value_estimates = self.sess.run(self.model.value_heads, feed_dict)
value_estimates = self.model.get_values(
np.expand_dims(brain_info.vector_observations[idx], 0)
)
value_estimates = {k: float(v) for k, v in value_estimates.items()}

value_estimates[k] = 0.0
return value_estimates
@property
def vis_obs_size(self):
return self.brain.number_visual_observations
@property
def vec_obs_size(self):
return self.brain.vector_observation_space_size
@property
def use_vis_obs(self):
return self.vis_obs_size > 0
@property
def use_vec_obs(self):
return self.vec_obs_size > 0
@property
def use_recurrent(self):
return False
@property
def use_continuous_act(self):
return True
def get_current_step(self):
"""
Gets current model step.
:return: current model step.
"""
step = self.global_step.detach().numpy()
return step
def increment_step(self, n_steps):
"""
Increments model step.
"""
self.global_step = self.global_step + n_steps
return self.get_current_step()
def save_model(self, step):
pass
def export_model(self):
pass
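
The update() above still interleaves the new torch computation with leftover TF feed-dict code, so the intended flow is easier to see in isolation. Below is a minimal, self-contained sketch of the clipped PPO losses and the optimizer step with made-up batch data; the network, shapes, clipping value, and entropy coefficient are illustrative, not taken from this PR.

import torch

epsilon = 0.2                          # illustrative clipping threshold
beta = 5e-3                            # illustrative entropy coefficient
policy = torch.nn.Linear(8, 2)         # stand-in for the actor/critic networks
optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)

obs = torch.randn(64, 8)
actions = torch.randn(64, 2)
advantages = torch.randn(64, 1)
old_log_probs = torch.randn(64, 2)
returns = torch.randn(64)
old_values = torch.randn(64)

dist = torch.distributions.Normal(policy(obs), 1.0)
log_probs = dist.log_prob(actions)
values = policy(obs).sum(dim=1)        # stand-in for a single value head

# Clipped value loss, as in ppo_value_loss().
clipped = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
value_loss = torch.mean(torch.max((returns - values) ** 2, (returns - clipped) ** 2))

# Clipped surrogate policy loss, as in ppo_policy_loss().
r_theta = torch.exp(log_probs - old_log_probs)
policy_loss = -torch.mean(torch.min(r_theta * advantages,
                                    torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantages))

loss = policy_loss + 0.5 * value_loss - beta * torch.mean(dist.entropy())
optimizer.zero_grad()
loss.backward()
optimizer.step()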

6
test_constraints_max_tf1_version.txt


# pip constraints to use the *highest* versions allowed in ml-agents/setup.py
# with the exception of tensorflow, which is constrained to <2
# For projects with upper bounds, we should periodically update this list to the latest release version
grpcio>=1.23.0
numpy>=1.17.2
tensorflow>=1.14.0,<2.0

273
ml-agents/mlagents/trainers/ppo/policy_old.py


import logging
import numpy as np
from typing import Any, Dict, Optional
import tensorflow as tf
from mlagents.envs.timers import timed
from mlagents.envs.brain import BrainInfo, BrainParameters
from mlagents.trainers.models import EncoderType, LearningRateSchedule
from mlagents.trainers.ppo.models import PPOModel
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
create_reward_signal,
)
from mlagents.trainers.components.bc.module import BCModule
logger = logging.getLogger("mlagents.trainers")
class PPOPolicy(TFPolicy):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_params: Dict[str, Any],
is_training: bool,
load: bool,
):
"""
Policy for Proximal Policy Optimization Networks.
:param seed: Random seed.
:param brain: Assigned Brain object.
:param trainer_params: Defined training parameters.
:param is_training: Whether the model should be trained.
:param load: Whether a pre-trained model will be loaded or a new one created.
"""
super().__init__(seed, brain, trainer_params)
reward_signal_configs = trainer_params["reward_signals"]
self.inference_dict: Dict[str, tf.Tensor] = {}
self.update_dict: Dict[str, tf.Tensor] = {}
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
self.create_model(
brain, trainer_params, reward_signal_configs, is_training, load, seed
)
self.create_reward_signals(reward_signal_configs)
self.trainer_params = trainer_params
with self.graph.as_default():
self.bc_module: Optional[BCModule] = None
# Create pretrainer if needed
if "pretraining" in trainer_params:
BCModule.check_config(trainer_params["pretraining"])
self.bc_module = BCModule(
self,
policy_learning_rate=trainer_params["learning_rate"],
default_batch_size=trainer_params["batch_size"],
default_num_epoch=trainer_params["num_epoch"],
**trainer_params["pretraining"],
)
if load:
self._load_graph()
else:
self._initialize_graph()
def create_model(
self, brain, trainer_params, reward_signal_configs, is_training, load, seed
):
"""
Create PPO model
:param brain: Assigned Brain object.
:param trainer_params: Defined training parameters.
:param reward_signal_configs: Reward signal config
:param seed: Random seed.
"""
with self.graph.as_default():
self.model = PPOModel(
brain=brain,
lr=float(trainer_params["learning_rate"]),
lr_schedule=LearningRateSchedule(
trainer_params.get("learning_rate_schedule", "linear")
),
h_size=int(trainer_params["hidden_units"]),
epsilon=float(trainer_params["epsilon"]),
beta=float(trainer_params["beta"]),
max_step=float(trainer_params["max_steps"]),
normalize=trainer_params["normalize"],
use_recurrent=trainer_params["use_recurrent"],
num_layers=int(trainer_params["num_layers"]),
m_size=self.m_size,
seed=seed,
stream_names=list(reward_signal_configs.keys()),
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
self.model.create_ppo_optimizer()
self.inference_dict.update(
{
"action": self.model.output,
"log_probs": self.model.all_log_probs,
"value_heads": self.model.value_heads,
"value": self.model.value,
"entropy": self.model.entropy,
"learning_rate": self.model.learning_rate,
}
)
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.model.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.model.memory_out
self.total_policy_loss = self.model.abs_policy_loss
self.update_dict.update(
{
"value_loss": self.model.value_loss,
"policy_loss": self.total_policy_loss,
"update_batch": self.model.update_batch,
}
)
def create_reward_signals(self, reward_signal_configs):
"""
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
self.reward_signals = {}
with self.graph.as_default():
# Create reward signals
for reward_signal, config in reward_signal_configs.items():
self.reward_signals[reward_signal] = create_reward_signal(
self, self.model, reward_signal, config
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)
@timed
def evaluate(self, brain_info):
"""
Evaluates policy for the agent experiences provided.
:param brain_info: BrainInfo object containing inputs.
:return: Outputs from network as defined by self.inference_dict.
"""
feed_dict = {
self.model.batch_size: len(brain_info.vector_observations),
self.model.sequence_length: 1,
}
epsilon = None
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[
self.model.prev_action
] = brain_info.previous_vector_actions.reshape(
[-1, len(self.model.act_size)]
)
if brain_info.memories.shape[1] == 0:
brain_info.memories = self.make_empty_memory(len(brain_info.agents))
feed_dict[self.model.memory_in] = brain_info.memories
if self.use_continuous_act:
epsilon = np.random.normal(
size=(len(brain_info.vector_observations), self.model.act_size[0])
)
feed_dict[self.model.epsilon] = epsilon
feed_dict = self.fill_eval_dict(feed_dict, brain_info)
run_out = self._execute_model(feed_dict, self.inference_dict)
if self.use_continuous_act:
run_out["random_normal_epsilon"] = epsilon
return run_out
@timed
def update(self, mini_batch, num_sequences):
"""
Performs update on model.
:param mini_batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
feed_dict = self.construct_feed_dict(self.model, mini_batch, num_sequences)
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():
feed_dict.update(
reward_signal.prepare_update(self.model, mini_batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
update_vals = self._execute_model(feed_dict, self.update_dict)
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
return update_stats
def construct_feed_dict(self, model, mini_batch, num_sequences):
feed_dict = {
model.batch_size: num_sequences,
model.sequence_length: self.sequence_length,
model.mask_input: mini_batch["masks"],
model.advantage: mini_batch["advantages"],
model.all_old_log_probs: mini_batch["action_probs"],
}
for name in self.reward_signals:
feed_dict[model.returns_holders[name]] = mini_batch[
"{}_returns".format(name)
]
feed_dict[model.old_values[name]] = mini_batch[
"{}_value_estimates".format(name)
]
if self.use_continuous_act:
feed_dict[model.output_pre] = mini_batch["actions_pre"]
feed_dict[model.epsilon] = mini_batch["random_normal_epsilon"]
else:
feed_dict[model.action_holder] = mini_batch["actions"]
if self.use_recurrent:
feed_dict[model.prev_action] = mini_batch["prev_action"]
feed_dict[model.action_masks] = mini_batch["action_mask"]
if self.use_vec_obs:
feed_dict[model.vector_in] = mini_batch["vector_obs"]
if self.model.vis_obs_size > 0:
for i, _ in enumerate(self.model.visual_in):
feed_dict[model.visual_in[i]] = mini_batch["visual_obs%d" % i]
if self.use_recurrent:
mem_in = [
mini_batch["memory"][i]
for i in range(0, len(mini_batch["memory"]), self.sequence_length)
]
feed_dict[model.memory_in] = mem_in
return feed_dict
def get_value_estimates(
self, brain_info: BrainInfo, idx: int, done: bool
) -> Dict[str, float]:
"""
Generates value estimates for bootstrapping.
:param brain_info: BrainInfo to be used for bootstrapping.
:param idx: Index in BrainInfo of agent.
:param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
:return: The value estimate dictionary with key being the name of the reward signal and the value the
corresponding value estimate.
"""
feed_dict: Dict[tf.Tensor, Any] = {
self.model.batch_size: 1,
self.model.sequence_length: 1,
}
for i in range(len(brain_info.visual_observations)):
feed_dict[self.model.visual_in[i]] = [
brain_info.visual_observations[i][idx]
]
if self.use_vec_obs:
feed_dict[self.model.vector_in] = [brain_info.vector_observations[idx]]
if self.use_recurrent:
if brain_info.memories.shape[1] == 0:
brain_info.memories = self.make_empty_memory(len(brain_info.agents))
feed_dict[self.model.memory_in] = [brain_info.memories[idx]]
if not self.use_continuous_act and self.use_recurrent:
feed_dict[self.model.prev_action] = [
brain_info.previous_vector_actions[idx]
]
value_estimates = self.sess.run(self.model.value_heads, feed_dict)
value_estimates = {k: float(v) for k, v in value_estimates.items()}
# If we're done, reassign all of the value estimates that need terminal states.
if done:
for k in value_estimates:
if self.reward_signals[k].use_terminal_states:
value_estimates[k] = 0.0
return value_estimates

2
ml-agents/mlagents/tf_utils/__init__.py


from mlagents.tf_utils.tf import tf as tf # noqa
from mlagents.tf_utils.tf import tf_flatten, tf_rnn, tf_variance_scaling # noqa

26
ml-agents/mlagents/tf_utils/tf.py


# This should be the only place that we import tensorflow directly.
# Everywhere else is caught by the banned-modules setting for flake8
import tensorflow as tf # noqa I201
from distutils.version import LooseVersion
# LooseVersion handles things "1.2.3a" or "4.5.6-rc7" fairly sensibly.
_is_tensorflow2 = LooseVersion(tf.__version__) >= LooseVersion("2.0.0")
# A few things that we use live in different places between tensorflow 1.x and 2.x
# If anything new is added, please add it here
if _is_tensorflow2:
import tensorflow.compat.v1 as tf
tf_variance_scaling = tf.initializers.variance_scaling
tf_flatten = tf.layers.flatten
tf_rnn = tf.nn.rnn_cell
# tf.disable_v2_behavior()
else:
import tensorflow.contrib.layers as c_layers
tf_variance_scaling = c_layers.variance_scaling_initializer
tf_flatten = c_layers.flatten
tf_rnn = tf.contrib.rnn
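
The rest of the codebase is expected to pull tf and these helper aliases from this shim instead of importing tensorflow or tf.contrib directly (the flake8 banned-modules hook added to .pre-commit-config.yaml enforces this). A small sketch of the intended call sites; the concrete sizes are illustrative:

from mlagents.tf_utils import tf, tf_variance_scaling, tf_flatten, tf_rnn

init = tf_variance_scaling(1.0)     # was tf.contrib.layers.variance_scaling_initializer(1.0)
cell = tf_rnn.BasicLSTMCell(128)    # was tf.contrib.rnn.BasicLSTMCell(128)
# tf_flatten(x) stands in for tf.contrib.layers.flatten(x); under TF 2.x the
# imported tf resolves to tensorflow.compat.v1.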

/test_constraints_max_version.txt → /test_constraints_max_tf2_version.txt
