Compare commits

...
This merge request contains changes that conflict with the target branch. Conflicting files:
/config/ppo/3DBallHard.yaml
/Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs
/ml-agents/mlagents/trainers/settings.py
/ml-agents/mlagents/trainers/trainer_util.py
/Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab

1 commit

Author        SHA1      Message         Commit date
Andrew Cohen  b37c4493  floor material  4 years ago

19 files changed, with 2085 additions and 11 deletions
  1. config/ppo/3DBallHard.yaml (2 changes)
  2. ml-agents/mlagents/trainers/trainer_util.py (11 changes)
  3. ml-agents/mlagents/trainers/settings.py (4 changes)
  4. Project/ProjectSettings/EditorBuildSettings.asset (8 changes)
  5. Project/ProjectSettings/UnityConnectSettings.asset (2 changes)
  6. Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (4 changes)
  7. Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab (10 changes)
  8. ml-agents/mlagents/trainers/policy/transfer_policy.py (920 changes)
  9. Project/Assets/ML-Agents/Examples/Crawler/Physics_materials.meta (8 changes)
  10. config/ppo_transfer/3DBall.yaml (26 changes)
  11. config/ppo_transfer/3DBallHard.yaml (26 changes)
  12. ml-agents/mlagents/trainers/ppo_transfer/__init__.py (0 changes)
  13. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (703 changes)
  14. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (350 changes)
  15. Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial (14 changes)
  16. Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial.meta (8 changes)

config/ppo/3DBallHard.yaml (2 changes)


gamma: 0.995
strength: 1.0
keep_checkpoints: 5
- max_steps: 5000000
+ max_steps: 4000000
time_horizon: 1000
summary_freq: 12000
threaded: true

ml-agents/mlagents/trainers/trainer_util.py (11 changes)


from mlagents.trainers.trainer import Trainer
from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.ppo.trainer import PPOTrainer
from mlagents.trainers.ppo_transfer.trainer import PPOTransferTrainer
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController

)
elif trainer_type == TrainerType.SAC:
trainer = SACTrainer(
brain_name,
min_lesson_length,
trainer_settings,
train_model,
load_model,
seed,
trainer_artifact_path,
)
elif trainer_type == TrainerType.PPO_Transfer:
trainer = PPOTransferTrainer(
brain_name,
min_lesson_length,
trainer_settings,
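The hunk above is truncated after trainer_settings, so the remaining constructor arguments of the new branch are not shown here. As a rough, self-contained sketch of the dispatch pattern this change extends (stub classes and a hypothetical make_trainer helper, not the actual initialize_trainer code):

# Hypothetical sketch only: the real trainer classes are stubbed out.
class PPOTrainer: ...
class SACTrainer: ...
class PPOTransferTrainer: ...   # the trainer added by this merge request

_TRAINERS = {
    "ppo": PPOTrainer,
    "sac": SACTrainer,
    "ppo_transfer": PPOTransferTrainer,
}

def make_trainer(trainer_type: str, *args, **kwargs):
    """Pick the trainer class named by a behavior's trainer_type and build it."""
    return _TRAINERS[trainer_type](*args, **kwargs)

print(type(make_trainer("ppo_transfer")).__name__)  # -> PPOTransferTrainer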

ml-agents/mlagents/trainers/settings.py (4 changes)


class TrainerType(Enum):
PPO: str = "ppo"
SAC: str = "sac"
+ PPO_Transfer: str = "ppo_transfer"
- _mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings}
+ _mapping = {TrainerType.PPO: PPOSettings, TrainerType.SAC: SACSettings,
+             TrainerType.PPO_Transfer: PPOSettings}
return _mapping[self]
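For context, a minimal self-contained sketch of what this settings hunk does: the new PPO_Transfer member is routed to the existing PPOSettings class through the _mapping dict. The settings classes are stubbed and the enclosing method name (to_settings) is assumed here:

from enum import Enum

class PPOSettings: ...
class SACSettings: ...

class TrainerType(Enum):
    PPO = "ppo"
    SAC = "sac"
    PPO_Transfer = "ppo_transfer"

    def to_settings(self) -> type:
        # ppo_transfer reuses the PPO hyperparameter settings class.
        _mapping = {
            TrainerType.PPO: PPOSettings,
            TrainerType.SAC: SACSettings,
            TrainerType.PPO_Transfer: PPOSettings,
        }
        return _mapping[self]

print(TrainerType("ppo_transfer").to_settings().__name__)  # -> PPOSettings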

Project/ProjectSettings/EditorBuildSettings.asset (8 changes)


EditorBuildSettings:
m_ObjectHideFlags: 0
serializedVersion: 2
- m_Scenes: []
+ m_Scenes:
+ - enabled: 0
+   path: Assets/ML-Agents/Examples/3DBall/Scenes/3DBallHard.unity
+   guid: 35c41099ceec44889bdbe95ed86c97ac
+ - enabled: 1
+   path: Assets/ML-Agents/Examples/3DBall/Scenes/3DBall.unity
+   guid: b9ac0cbf961bf4dacbfa0aa9c0d60aaa
m_configObjects: {}

Project/ProjectSettings/UnityConnectSettings.asset (2 changes)


UnityConnectSettings:
m_ObjectHideFlags: 0
serializedVersion: 1
- m_Enabled: 1
+ m_Enabled: 0
m_TestMode: 0
m_EventOldUrl: https://api.uca.cloud.unity3d.com/v1/events
m_EventUrl: https://cdp.cloud.unity3d.com/v1/events

Project/Assets/ML-Agents/Examples/3DBall/Scripts/Ball3DAgent.cs (4 changes)


public override void OnActionReceived(float[] vectorAction)
{
- var actionZ = 2f * Mathf.Clamp(vectorAction[0], -1f, 1f);
- var actionX = 2f * Mathf.Clamp(vectorAction[1], -1f, 1f);
+ var actionZ = 1f * Mathf.Clamp(vectorAction[0], -1f, 1f);
+ var actionX = 1f * Mathf.Clamp(vectorAction[1], -1f, 1f);
if ((gameObject.transform.rotation.z < 0.25f && actionZ > 0f) ||
(gameObject.transform.rotation.z > -0.25f && actionZ < 0f))

Project/Assets/ML-Agents/Examples/Crawler/Prefabs/FixedPlatform.prefab (10 changes)


m_LocalPosition: {x: 200, y: 0, z: 0}
m_LocalScale: {x: 1, y: 1, z: 1}
m_Children:
- {fileID: 3386028169429758297}
- {fileID: 3386028169429758297}
m_Father: {fileID: 0}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}

m_LocalScale: {x: 1, y: 1, z: 1}
m_Children: []
m_Father: {fileID: 4309919623019186}
- m_RootOrder: 2
+ m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!33 &33357510309310810
MeshFilter:

m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1846708386698568}
- m_Material: {fileID: 0}
+ m_Material: {fileID: 13400000, guid: dff6e5680d76643a481e8d81555ef3ee, type: 2}
m_IsTrigger: 0
m_Enabled: 1
serializedVersion: 2

- {fileID: 4856650706546504}
- {fileID: 4791482523457020}
m_Father: {fileID: 4309919623019186}
- m_RootOrder: 1
+ m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
--- !u!1 &1995322274649904
GameObject:

- target: {fileID: 4845971001715176651, guid: 0456c89e8c9c243d595b039fe7aa0bf9,
type: 3}
propertyPath: m_RootOrder
- value: 0
+ value: 2
objectReference: {fileID: 0}
- target: {fileID: 4845971001715176651, guid: 0456c89e8c9c243d595b039fe7aa0bf9,
type: 3}

ml-agents/mlagents/trainers/policy/transfer_policy.py (920 changes)


import os
from typing import Any, Dict, Optional, List, Tuple
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models import EncoderType
from mlagents.trainers.models import ModelUtils
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.distributions import (
GaussianDistribution,
MultiCategoricalDistribution,
)
import tf_slim as slim
EPSILON = 1e-6 # Small value to avoid divide by zero
class GaussianEncoderDistribution:
def __init__(
self,
encoded: tf.Tensor,
feature_size: int
):
self.mu = tf.layers.dense(
encoded,
feature_size,
activation=None,
name="mu",
kernel_initializer=ModelUtils.scaled_init(0.01),
reuse=tf.AUTO_REUSE,
)
self.log_sigma = tf.layers.dense(
encoded,
feature_size,
activation=None,
name="log_std",
kernel_initializer=ModelUtils.scaled_init(0.01),
)
self.sigma = tf.exp(self.log_sigma)
def sample(self):
epsilon = tf.random_normal(tf.shape(self.mu))
sampled = self.mu + self.sigma * epsilon
return sampled
def kl_standard(self):
"""
KL divergence with a standard gaussian
"""
kl = 0.5 * tf.reduce_sum(tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1)
return kl
class TransferPolicy(TFPolicy):
def __init__(
self,
seed: int,
brain: BrainParameters,
trainer_params: TrainerSettings,
is_training: bool,
model_path: str,
load: bool,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
create_tf_graph: bool = True,
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous action spaces, as well as recurrent networks.
:param seed: Random seed.
:param brain: Assigned BrainParameters object.
:param trainer_params: Defined training parameters.
:param is_training: Whether the model should be trained.
:param load: Whether a pre-trained model will be loaded or a new one created.
:param model_path: Path where the model should be saved and loaded.
:param tanh_squash: Whether to use a tanh function on the continuous output, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy in continuous output.
"""
super().__init__(seed, brain, trainer_params, model_path, load)
self.grads = None
self.update_batch: Optional[tf.Operation] = None
num_layers = self.network_settings.num_layers
self.h_size = self.network_settings.hidden_units
if num_layers < 1:
num_layers = 1
self.num_layers = num_layers
self.vis_encode_type = self.network_settings.vis_encode_type
self.tanh_squash = tanh_squash
self.reparameterize = reparameterize
self.condition_sigma_on_obs = condition_sigma_on_obs
self.trainable_variables: List[tf.Variable] = []
self.encoder = None
self.encoder_distribution = None
self.targ_encoder = None
# Model-based learning
self.feature_size = 16 # dimension of latent feature size
# Non-exposed parameters; these aren't exposed because they don't have a
# good explanation and usually shouldn't be touched.
self.log_std_min = -20
self.log_std_max = 2
if create_tf_graph:
self.create_tf_graph()
def get_trainable_variables(self) -> List[tf.Variable]:
"""
Returns a List of the trainable variables in this policy. if create_tf_graph hasn't been called,
returns empty list.
"""
return self.trainable_variables
def create_tf_graph(self,
encoder_layers = 1,
policy_layers = 1,
policy_units = 128,
transfer=False,
separate_train=False,
var_encoder=False,
var_predict=False,
predict_return=False,
inverse_model=False
) -> None:
"""
Builds the tensorflow graph needed for this policy.
"""
self.inverse_model = inverse_model
with self.graph.as_default():
tf.set_random_seed(self.seed)
_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
if len(_vars) > 0:
# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
)
self.next_visual_in: List[tf.Tensor] = []
with tf.variable_scope("encoding"):
self.encoder, self.targ_encoder = self.create_encoders()
with tf.variable_scope("inverse"):
self.create_inverse_model(self.encoder, self.targ_encoder)
with tf.variable_scope("predict"):
self.create_forward_model(self.encoder, self.targ_encoder)
# if var_encoder:
# self.encoder_distribution, self.encoder = self._create_var_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# _, self.targ_encoder = self._create_var_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# else:
# self.encoder = self._create_encoder(
# self.visual_in,
# self.processed_vector_in,
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# self.targ_encoder = self._create_target_encoder(
# self.h_size,
# self.feature_size,
# encoder_layers,
# self.vis_encode_type
# )
# self._create_hard_copy()
# if var_predict:
# self.predict_distribution, self.predict = self._create_var_world_model(
# self.encoder,
# self.h_size,
# self.feature_size,
# self.num_layers,
# self.vis_encode_type,
# predict_return
# )
# else:
# self.predict = self._create_world_model(
# self.encoder,
# self.h_size,
# self.feature_size,
# self.num_layers,
# self.vis_encode_type,
# predict_return
# )
# if inverse_model:
# self._create_inverse_model(self.encoder, self.targ_encoder)
if self.use_continuous_act:
self._create_cc_actor(
self.encoder,
policy_units,
policy_layers,
self.tanh_squash,
self.reparameterize,
self.condition_sigma_on_obs,
separate_train
)
else:
self._create_dc_actor(self.encoder, policy_units, policy_layers, separate_train)
self.trainable_variables = tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="encoding"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="predict"
)
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
) # LSTMs need to be root scope for Barracuda export
if not transfer:
self.trainable_variables += tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope="inverse"
)
self.inference_dict: Dict[str, tf.Tensor] = {
"action": self.output,
"log_probs": self.all_log_probs,
"entropy": self.entropy,
}
if self.use_continuous_act:
self.inference_dict["pre_action"] = self.output_pre
if self.use_recurrent:
self.inference_dict["memory_out"] = self.memory_out
# We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
# it will re-load the full graph
self._initialize_graph()
# slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True)
def load_graph_partial(self, path: str, transfer_type="dynamics"):
load_nets = {"dynamics": ["policy", "predict", "value"],
"observation": ["encoding", "inverse"]}
if self.inverse_model:
load_nets["dynamics"].append("inverse")
with self.graph.as_default():
for net in load_nets[transfer_type]:
variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, net)
partial_saver = tf.train.Saver(variables_to_restore)
partial_model_checkpoint = os.path.join(path, f"{net}.ckpt")
partial_saver.restore(self.sess, partial_model_checkpoint)
print("loaded net", net, "from path", path)
# variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
# partial_saver = tf.train.Saver(variables_to_restore)
# partial_model_checkpoint = os.path.join(path, f"latent.ckpt")
# partial_saver.restore(self.sess, partial_model_checkpoint)
# print("loaded net latent from path", path)
if transfer_type == "observation":
self.run_hard_copy()
def _create_world_model(
self,
encoder: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
predict_return: bool=False
) -> tf.Tensor:
""""
Builds the world model for state prediction
"""
with self.graph.as_default():
with tf.variable_scope("predict"):
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"
)
hidden_stream = ModelUtils.create_vector_observation_encoder(
tf.concat([encoder, self.current_action], axis=1),
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
if predict_return:
predict = tf.layers.dense(
hidden_stream,
feature_size+1,
name="next_state"
)
else:
predict = tf.layers.dense(
hidden_stream,
feature_size,
name="next_state"
)
return predict
def _create_var_world_model(
self,
encoder: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
predict_return: bool=False
) -> tf.Tensor:
""""
Builds the world model for state prediction
"""
with self.graph.as_default():
with tf.variable_scope("predict"):
hidden_stream = ModelUtils.create_vector_observation_encoder(
tf.concat([encoder, self.current_action], axis=1),
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
with tf.variable_scope("latent"):
if predict_return:
predict_distribution = GaussianEncoderDistribution(
hidden_stream,
feature_size+1
)
else:
predict_distribution = GaussianEncoderDistribution(
hidden_stream,
feature_size
)
predict = predict_distribution.sample()
return predict_distribution, predict
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
"""
Evaluates policy for the agent experiences provided.
:param decision_requests: DecisionSteps object containing inputs.
:param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
:return: Outputs from network as defined by self.inference_dict.
"""
feed_dict = {
self.batch_size_ph: len(decision_requests),
self.sequence_length_ph: 1,
}
if self.use_recurrent:
if not self.use_continuous_act:
feed_dict[self.prev_action] = self.retrieve_previous_action(
global_agent_ids
)
feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
run_out = self._execute_model(feed_dict, self.inference_dict)
return run_out
def _create_target_encoder(
self,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.vector_next,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.vector_next
with tf.variable_scope("target_enc"):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
latent_targ = tf.layers.dense(
hidden_stream_targ,
feature_size,
name="latent"
)
return tf.stop_gradient(latent_targ)
def _create_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
"""
Creates an encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("encoding"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in,
vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
latent = tf.layers.dense(
hidden_stream,
feature_size,
name="latent"
)
return latent
def _create_var_target_encoder(
self,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType,
) -> tf.Tensor:
self.visual_next = ModelUtils.create_visual_input_placeholders(
self.brain.camera_resolutions
)
self.vector_next = ModelUtils.create_vector_input(self.vec_obs_size)
if self.normalize:
self.processed_vector_next = ModelUtils.normalize_vector_obs(
self.vector_next,
self.running_mean,
self.running_variance,
self.normalization_steps,
)
else:
self.processed_vector_next = self.vector_next
with tf.variable_scope("target_enc"):
hidden_stream_targ = ModelUtils.create_observation_streams(
self.visual_next,
self.processed_vector_next,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
with tf.variable_scope("latent"):
latent_targ_distribution = GaussianEncoderDistribution(
hidden_stream_targ,
feature_size
)
latent_targ = latent_targ_distribution.sample()
return latent_targ_distribution, tf.stop_gradient(latent_targ)
def _create_var_encoder(
self,
visual_in: List[tf.Tensor],
vector_in: tf.Tensor,
h_size: int,
feature_size: int,
num_layers: int,
vis_encode_type: EncoderType
) -> tf.Tensor:
"""
Creates a variational encoder for visual and vector observations.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:return: The hidden layer (tf.Tensor) after the encoder.
"""
with tf.variable_scope("encoding"):
hidden_stream = ModelUtils.create_observation_streams(
visual_in,
vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
with tf.variable_scope("latent"):
latent_distribution = GaussianEncoderDistribution(
hidden_stream,
feature_size
)
latent = latent_distribution.sample()
return latent_distribution, latent
def _create_hard_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_enc')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoding')
with tf.variable_scope('hard_replacement'):
self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
def run_hard_copy(self):
self.sess.run(self.target_replace_op)
def _create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
with tf.variable_scope("inverse"):
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
hidden = tf.layers.dense(combined_input, self.h_size, activation=ModelUtils.swish)
if self.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(
hidden, self.act_size[0], activation=None
)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.current_action), axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
else:
pred_action = tf.concat(
[
tf.layers.dense(
hidden, self.act_size[i], activation=tf.nn.softmax
)
for i in range(len(self.act_size))
],
axis=1,
)
cross_entropy = tf.reduce_sum(
-tf.log(pred_action + 1e-10) * self.current_action, axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
)
def _create_cc_actor(
self,
encoded: tf.Tensor,
h_size: int,
num_layers: int,
tanh_squash: bool = False,
reparameterize: bool = False,
condition_sigma_on_obs: bool = True,
separate_train: bool = False
) -> None:
"""
Creates Continuous control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
:param tanh_squash: Whether to use a tanh function, or a clipped output.
:param reparameterize: Whether we are using the resampling trick to update the policy.
"""
if self.use_recurrent:
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
)
self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
else:
hidden_policy = encoded
if separate_train:
hidden_policy = tf.stop_gradient(hidden_policy)
with tf.variable_scope("policy"):
hidden_policy = ModelUtils.create_vector_observation_encoder(
hidden_policy,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False,
)
distribution = GaussianDistribution(
hidden_policy,
self.act_size,
reparameterize=reparameterize,
tanh_squash=tanh_squash,
condition_sigma=condition_sigma_on_obs,
)
if tanh_squash:
self.output_pre = distribution.sample
self.output = tf.identity(self.output_pre, name="action")
else:
self.output_pre = distribution.sample
# Clip and scale output to ensure actions are always within [-1, 1] range.
output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
self.output = tf.identity(output_post, name="action")
self.selected_actions = tf.stop_gradient(self.output)
self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
self.entropy = distribution.entropy
# We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
self.total_log_probs = distribution.total_log_probs
def _create_dc_actor(
self,
encoded: tf.Tensor,
h_size: int,
num_layers: int,
separate_train: bool = False
) -> None:
"""
Creates Discrete control actor-critic model.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: Type of visual encoder to use if visual input.
"""
if self.use_recurrent:
self.prev_action = tf.placeholder(
shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
)
prev_action_oh = tf.concat(
[
tf.one_hot(self.prev_action[:, i], self.act_size[i])
for i in range(len(self.act_size))
],
axis=1,
)
hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
self.memory_in = tf.placeholder(
shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
)
hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
hidden_policy,
self.memory_in,
self.sequence_length_ph,
name="lstm_policy",
)
self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
else:
hidden_policy = encoded
if separate_train:
hidden_policy = tf.stop_gradient(hidden_policy)
self.action_masks = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
)
with tf.variable_scope("policy"):
hidden_policy = ModelUtils.create_vector_observation_encoder(
hidden_policy,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False,
)
distribution = MultiCategoricalDistribution(
hidden_policy, self.act_size, self.action_masks
)
# It's important that we are able to feed_dict a value into this tensor to get the
# right one-hot encoding, so we can't do identity on it.
self.output = distribution.sample
self.all_log_probs = tf.identity(distribution.log_probs, name="action")
self.selected_actions = tf.stop_gradient(
distribution.sample_onehot
) # In discrete, these are onehot
self.entropy = distribution.entropy
self.total_log_probs = distribution.total_log_probs
def save_model(self, steps):
"""
Saves the model
:param steps: The number of steps the model was trained for
:return:
"""
with self.graph.as_default():
last_checkpoint = os.path.join(self.model_path, f"model-{steps}.ckpt")
self.saver.save(self.sess, last_checkpoint)
tf.train.write_graph(
self.graph, self.model_path, "raw_graph_def.pb", as_text=False
)
# save each net separately
policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
policy_saver = tf.train.Saver(policy_vars)
policy_checkpoint = os.path.join(self.model_path, f"policy.ckpt")
policy_saver.save(self.sess, policy_checkpoint)
encoding_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
encoding_saver = tf.train.Saver(encoding_vars)
encoding_checkpoint = os.path.join(self.model_path, f"encoding.ckpt")
encoding_saver.save(self.sess, encoding_checkpoint)
# latent_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
# latent_saver = tf.train.Saver(latent_vars)
# latent_checkpoint = os.path.join(self.model_path, f"latent.ckpt")
# latent_saver.save(self.sess, latent_checkpoint)
predict_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
predict_saver = tf.train.Saver(predict_vars)
predict_checkpoint = os.path.join(self.model_path, f"predict.ckpt")
predict_saver.save(self.sess, predict_checkpoint)
value_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
value_saver = tf.train.Saver(value_vars)
value_checkpoint = os.path.join(self.model_path, f"value.ckpt")
value_saver.save(self.sess, value_checkpoint)
if self.inverse_model:
inverse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse")
inverse_saver = tf.train.Saver(inverse_vars)
inverse_checkpoint = os.path.join(self.model_path, f"inverse.ckpt")
inverse_saver.save(self.sess, inverse_checkpoint)
def get_encoder_weights(self):
with self.graph.as_default():
enc = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "encoding/main_graph_0/hidden_0/bias:0")
targ = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_enc/main_graph_0/hidden_0/bias:0")
print("encoding:", self.sess.run(enc))
print("target:", self.sess.run(targ))
def create_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
encoded_state_list = []
encoded_next_state_list = []
if self.vis_obs_size > 0:
self.next_visual_in = []
visual_encoders = []
next_visual_encoders = []
for i in range(self.vis_obs_size):
# Create input ops for next (t+1) visual observations.
next_visual_input = ModelUtils.create_visual_input(
self.brain.camera_resolutions[i],
name="curiosity_next_visual_observation_" + str(i),
)
self.next_visual_in.append(next_visual_input)
# Create the encoder ops for current and next visual input.
# Note that these encoders are siamese.
encoded_visual = ModelUtils.create_visual_observation_encoder(
self.visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_stream_{}_visual_obs_encoder".format(i),
False,
)
encoded_next_visual = ModelUtils.create_visual_observation_encoder(
self.next_visual_in[i],
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_stream_{}_visual_obs_encoder".format(i),
True,
)
visual_encoders.append(encoded_visual)
next_visual_encoders.append(encoded_next_visual)
hidden_visual = tf.concat(visual_encoders, axis=1)
hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
encoded_state_list.append(hidden_visual)
encoded_next_state_list.append(hidden_next_visual)
if self.vec_obs_size > 0:
# Create the encoder ops for current and next vector input.
# Note that these encoders are siamese.
# Create input op for next (t+1) vector observation.
self.next_vector_in = tf.placeholder(
shape=[None, self.vec_obs_size],
dtype=tf.float32,
name="curiosity_next_vector_observation",
)
encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
self.vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_vector_obs_encoder",
False,
)
encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
self.next_vector_in,
self.h_size,
ModelUtils.swish,
self.num_layers,
"curiosity_vector_obs_encoder",
True,
)
encoded_state_list.append(encoded_vector_obs)
encoded_next_state_list.append(encoded_next_vector_obs)
encoded_state = tf.concat(encoded_state_list, axis=1)
encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
encoded_state = tf.layers.dense(
encoded_state,
self.feature_size,
name="latent"
)
encoded_next_state = tf.layers.dense(
encoded_next_state,
self.feature_size,
name="latent",
reuse=True
)
return encoded_state, encoded_next_state
def create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates inverse model TensorFlow ops for Curiosity module.
Predicts action taken given current and future encoded states.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
# hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
if self.brain.vector_action_space_type == "continuous":
pred_action = tf.layers.dense(
combined_input, self.act_size[0], activation=None
)
squared_difference = tf.reduce_sum(
tf.squared_difference(pred_action, self.current_action), axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
else:
pred_action = tf.concat(
[
tf.layers.dense(
combined_input, self.act_size[i], activation=tf.nn.softmax
)
for i in range(len(self.act_size))
],
axis=1,
)
cross_entropy = tf.reduce_sum(
-tf.log(pred_action + 1e-10) * self.current_action, axis=1
)
self.inverse_loss = tf.reduce_mean(
tf.dynamic_partition(cross_entropy, self.mask, 2)[1]
)
def create_forward_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
"""
Creates forward model TensorFlow ops for Curiosity module.
Predicts encoded future state based on encoded current state and given action.
:param encoded_state: Tensor corresponding to encoded current state.
:param encoded_next_state: Tensor corresponding to encoded next state.
"""
combined_input = tf.concat(
[encoded_state, self.current_action], axis=1
)
# hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
predict = tf.layers.dense(
combined_input,
self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
activation=None,
)
self.predict = tf.layers.dense(
predict,
self.feature_size,
name="latent"
)
squared_difference = 0.5 * tf.reduce_sum(
tf.squared_difference(self.predict, encoded_next_state), axis=1
)
self.intrinsic_reward = squared_difference
self.forward_loss = tf.reduce_mean(
tf.dynamic_partition(squared_difference, self.mask, 2)[1]
)
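To make the forward-model objective above concrete, here is a small numpy-only illustration (not the PR's TensorFlow code) of the same quantities: the intrinsic reward is half the squared error between the predicted and actual next-state encodings, and the forward loss is its mean (the step mask applied via tf.dynamic_partition is omitted):

import numpy as np

rng = np.random.default_rng(0)
feature_size = 16                                   # latent size used by TransferPolicy
predict = rng.normal(size=(4, feature_size))        # predicted next-state encoding
encoded_next_state = rng.normal(size=(4, feature_size))

squared_difference = 0.5 * np.sum((predict - encoded_next_state) ** 2, axis=1)
intrinsic_reward = squared_difference               # one curiosity-style reward per sample
forward_loss = squared_difference.mean()            # mean over samples (mask omitted)
print(intrinsic_reward.shape, round(float(forward_loss), 3))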

Project/Assets/ML-Agents/Examples/Crawler/Physics_materials.meta (8 changes)


fileFormatVersion: 2
guid: 87be98804068a4296bc57ee51587f7b7
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

config/ppo_transfer/3DBall.yaml (26 changes)


behaviors:
3DBall:
trainer_type: ppo_transfer
hyperparameters:
batch_size: 64
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.99
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 1
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.99
strength: 1.0
keep_checkpoints: 5
max_steps: 500000
time_horizon: 1000
summary_freq: 12000
threaded: true

config/ppo_transfer/3DBallHard.yaml (26 changes)


behaviors:
3DBallHard:
trainer_type: ppo_transfer
hyperparameters:
batch_size: 1200
buffer_size: 12000
learning_rate: 0.0003
beta: 0.001
epsilon: 0.2
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 128
num_layers: 1
vis_encode_type: simple
reward_signals:
extrinsic:
gamma: 0.995
strength: 1.0
keep_checkpoints: 5
max_steps: 4000000
time_horizon: 1000
summary_freq: 12000
threaded: true

ml-agents/mlagents/trainers/ppo_transfer/__init__.py (0 changes)

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (703 changes)


from typing import Optional, Any, Dict, cast
import numpy as np
import os
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.policy.transfer_policy import TransferPolicy
from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import TrainerSettings, PPOSettings
import tf_slim as slim
class PPOTransferOptimizer(TFOptimizer):
def __init__(self, policy: TransferPolicy, trainer_params: TrainerSettings):
"""
Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
The PPO optimizer has a value estimator and a loss function.
:param policy: A TFPolicy object that will be updated by this PPO Optimizer.
:param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
"""
self.separate_value_train = False
self.separate_policy_train = False
self.use_var_encoder = False
self.use_var_predict = False
self.with_prior = False
self.use_inverse_model = True
self.predict_return = False
self.use_alter = False
self.in_batch_alter = False
self.in_epoch_alter = False
self.num_updates = 0
self.alter_every = 400
self.copy_every = 1
self.train_type = "all"
# Transfer
self.use_transfer = False
self.smart_transfer = False
self.conv_thres = 1e-6
self.old_loss = np.inf
self.update_mode = "model"
self.transfer_path = "results/BallSingle_nosep_cmodel_small/3DBall"
self.transfer_type = "dynamics"
self.ppo_update_dict: Dict[str, tf.Tensor] = {}
self.model_update_dict: Dict[str, tf.Tensor] = {}
# Create the graph here to give more granular control of the TF graph to the Optimizer.
policy.create_tf_graph(1, 1, 128, self.use_transfer, self.separate_policy_train,
self.use_var_encoder, self.use_var_predict, self.predict_return, self.use_inverse_model)
with policy.graph.as_default():
super().__init__(policy, trainer_params)
hyperparameters: PPOSettings = cast(
PPOSettings, trainer_params.hyperparameters
)
lr = float(hyperparameters.learning_rate)
self._schedule = hyperparameters.learning_rate_schedule
epsilon = float(hyperparameters.epsilon)
beta = float(hyperparameters.beta)
max_step = float(trainer_params.max_steps)
policy_network_settings = policy.network_settings
h_size = int(policy_network_settings.hidden_units)
num_layers = policy_network_settings.num_layers
vis_encode_type = policy_network_settings.vis_encode_type
self.burn_in_ratio = 0.0
self.stream_names = list(self.reward_signals.keys())
self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
self.grads = None
self.update_batch: Optional[tf.Operation] = None
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Model Loss": "model_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
if self.policy.use_recurrent:
self.m_size = self.policy.m_size
self.memory_in = tf.placeholder(
shape=[None, self.m_size],
dtype=tf.float32,
name="recurrent_value_in",
)
if num_layers < 1:
num_layers = 1
with tf.variable_scope("value"):
if policy.use_continuous_act:
self._create_cc_critic(h_size, num_layers, vis_encode_type)
else:
self._create_dc_critic(h_size, num_layers, vis_encode_type)
with tf.variable_scope("optimizer/"):
self.learning_rate = ModelUtils.create_schedule(
self._schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self._create_losses(
self.policy.total_log_probs,
self.old_log_probs,
self.value_heads,
self.policy.entropy,
self.policy.targ_encoder,
self.policy.predict,
self.policy.encoder_distribution,
beta,
epsilon,
lr,
max_step,
)
self._create_ppo_optimizer_ops()
self.update_dict.update(
{
"value_loss": self.value_loss,
"policy_loss": self.abs_policy_loss,
"model_loss": self.model_loss,
"update_batch": self.update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)
if self.use_alter or self.smart_transfer or self.in_batch_alter or self.in_epoch_alter:
self._init_alter_update()
self.policy.initialize_or_load()
if self.use_transfer:
self.policy.load_graph_partial(self.transfer_path, self.transfer_type)
self.policy.get_encoder_weights()
# saver = tf.train.Saver()
# model_checkpoint = os.path.join(self.transfer_path, f"model-4000544.ckpt")
# saver.restore(self.sess, model_checkpoint)
# self.policy._set_step(0)
slim.model_analyzer.analyze_vars(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), print_info=True)
print("All variables in the graph:")
for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
print(variable.name)
# tf.summary.FileWriter(self.policy.model_path, self.sess.graph)
def _create_cc_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
if self.separate_value_train:
input_state = tf.stop_gradient(self.policy.encoder)
else:
input_state = self.policy.encoder
hidden_value = ModelUtils.create_vector_observation_encoder(
input_state,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
self.old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
def _create_dc_critic(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
if self.separate_value_train:
input_state = tf.stop_gradient(self.policy.encoder)
else:
input_state = self.policy.encoder
hidden_value = ModelUtils.create_vector_observation_encoder(
input_state,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=False
)
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
# Break old log probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)
_, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
old_log_prob_branches, self.policy.action_masks, self.policy.act_size
)
action_idx = [0] + list(np.cumsum(self.policy.act_size))
self.old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.policy.selected_actions[
:, action_idx[i] : action_idx[i + 1]
],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
def _create_losses(
self, probs, old_probs, value_heads, entropy, targ_encoder, predict, encoder_distribution,
beta, epsilon, lr, max_step
):
"""
Creates training-specific Tensorflow ops for PPO models.
:param probs: Current policy probabilities
:param old_probs: Past policy probabilities
:param value_heads: Value estimate tensors from each value stream
:param beta: Entropy regularization strength
:param entropy: Current policy entropy
:param epsilon: Value for policy-divergence threshold
:param lr: Learning rate
:param max_step: Total number of training steps.
"""
self.returns_holders = {}
self.old_values = {}
for name in value_heads.keys():
returns_holder = tf.placeholder(
shape=[None], dtype=tf.float32, name="{}_returns".format(name)
)
old_value = tf.placeholder(
shape=[None], dtype=tf.float32, name="{}_value_estimate".format(name)
)
self.returns_holders[name] = returns_holder
self.old_values[name] = old_value
self.advantage = tf.placeholder(
shape=[None], dtype=tf.float32, name="advantages"
)
advantage = tf.expand_dims(self.advantage, -1)
self.decay_epsilon = ModelUtils.create_schedule(
self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
)
self.decay_beta = ModelUtils.create_schedule(
self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
)
value_losses = []
for name, head in value_heads.items():
clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
tf.reduce_sum(head, axis=1) - self.old_values[name],
-self.decay_epsilon,
self.decay_epsilon,
)
v_opt_a = tf.squared_difference(
self.returns_holders[name], tf.reduce_sum(head, axis=1)
)
v_opt_b = tf.squared_difference(
self.returns_holders[name], clipped_value_estimate
)
value_loss = tf.reduce_mean(
tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
1
]
)
value_losses.append(value_loss)
self.value_loss = tf.reduce_mean(value_losses)
r_theta = tf.exp(probs - old_probs)
p_opt_a = r_theta * advantage
p_opt_b = (
tf.clip_by_value(
r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
)
* advantage
)
self.policy_loss = -tf.reduce_mean(
tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
)
# For cleaner stats reporting
self.abs_policy_loss = tf.abs(self.policy_loss)
# encoder and predict loss
self.dis_returns = tf.placeholder(
shape=[None], dtype=tf.float32, name="dis_returns"
)
# target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1)
# if self.predict_return:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target))
# else:
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder))
# if self.with_prior:
# if self.use_var_encoder:
# self.model_loss += encoder_distribution.kl_standard()
# if self.use_var_predict:
# self.model_loss += self.policy.predict_distribution.kl_standard()
# if self.use_inverse_model:
# self.model_loss += self.policy.inverse_loss
self.model_loss = 0.2 * self.policy.forward_loss + 0.8 * self.policy.inverse_loss
self.loss = (
self.policy_loss
+ self.model_loss
+ 0.5 * self.value_loss
- self.decay_beta
* tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
)
self.ppo_loss = (
self.policy_loss
+ 0.5 * self.value_loss
- self.decay_beta
* tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
)
def _create_ppo_optimizer_ops(self):
if self.use_transfer:
if self.transfer_type == "dynamics":
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += self.policy.get_trainable_variables
print("trainable", train_vars)
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/mu")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/log_std")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
elif self.transfer_type == "observation":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
else:
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
train_vars += self.policy.get_trainable_variables()
print("trainable", train_vars)
self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)
self.update_batch = self.tf_optimizer.minimize(self.loss, var_list=train_vars)
def _init_alter_update(self):
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.ppo_update_dict.update(
{
"value_loss": self.value_loss,
"policy_loss": self.abs_policy_loss,
"update_batch": self.ppo_update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)
self.model_update_dict.update(
{
"model_loss": self.model_loss,
"update_batch": self.model_update_batch,
"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}
)
@timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""
Performs update on model.
:param mini_batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
feed_dict = self._construct_feed_dict(batch, num_sequences)
stats_needed = self.stats_name_to_update_name
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():
feed_dict.update(
reward_signal.prepare_update(self.policy, batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
if self.use_alter:
if self.num_updates / self.alter_every == 0:
update_vals = self._execute_model(feed_dict, self.update_dict)
if self.num_updates % self.alter_every == 0:
print("start update all", self.num_updates)
elif (self.num_updates / self.alter_every) % 2 == 1:
update_vals = self._execute_model(feed_dict, self.model_update_dict)
if self.num_updates % self.alter_every == 0:
print("start update model", self.num_updates)
else: # (self.num_updates / self.alter_every) % 2 == 0:
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
if self.num_updates % self.alter_every == 0:
print("start update policy", self.num_updates)
elif self.in_batch_alter:
update_vals = self._execute_model(feed_dict, self.model_update_dict)
update_vals.update(self._execute_model(feed_dict, self.ppo_update_dict))
elif self.use_transfer and self.smart_transfer:
if self.update_mode == "model":
update_vals = self._execute_model(feed_dict, self.update_dict)
cur_loss = update_vals["model_loss"]
print("model loss:", cur_loss)
if abs(cur_loss - self.old_loss) < self.conv_thres:
self.update_mode = "policy"
print("start to train policy")
else:
self.old_loss = cur_loss
if self.update_mode == "policy":
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
else:
update_vals = self._execute_model(feed_dict, self.update_dict)
# update target encoder
# if self.num_updates % self.copy_every == 0:
# self.policy.run_hard_copy()
# print("copy")
# self.policy.get_encoder_weights()
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
self.num_updates += 1
return update_stats
def update_part(self, batch: AgentBuffer, num_sequences: int, update_type: str="policy") -> Dict[str, float]:
"""
Performs update on model.
:param mini_batch: Batch of experiences.
:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
feed_dict = self._construct_feed_dict(batch, num_sequences)
if update_type == "model":
stats_needed = {
"Losses/Model Loss": "model_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
elif update_type == "policy":
stats_needed = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}
update_stats = {}
# Collect feed dicts for all reward signals.
for _, reward_signal in self.reward_signals.items():
feed_dict.update(
reward_signal.prepare_update(self.policy, batch, num_sequences)
)
stats_needed.update(reward_signal.stats_name_to_update_name)
if update_type == "model":
update_vals = self._execute_model(feed_dict, self.model_update_dict)
elif update_type == "policy":
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
# update target encoder
# self.policy.run_hard_copy()
for stat_name, update_name in stats_needed.items():
update_stats[stat_name] = update_vals[update_name]
return update_stats
def _construct_feed_dict(
self, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
burn_in_mask[range(0, num_burn_in)] = 0
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
self.policy.batch_size_ph: num_sequences,
self.policy.sequence_length_ph: self.policy.sequence_length,
self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
self.advantage: mini_batch["advantages"],
self.all_old_log_probs: mini_batch["action_probs"],
# self.policy.processed_vector_next: mini_batch["next_vector_in"],
self.policy.next_vector_in: mini_batch["next_vector_in"],
self.policy.current_action: mini_batch["actions"],
self.dis_returns: mini_batch["discounted_returns"]
}
for name in self.reward_signals:
feed_dict[self.returns_holders[name]] = mini_batch[
"{}_returns".format(name)
]
feed_dict[self.old_values[name]] = mini_batch[
"{}_value_estimates".format(name)
]
if self.policy.output_pre is not None and "actions_pre" in mini_batch:
feed_dict[self.policy.output_pre] = mini_batch["actions_pre"]
else:
feed_dict[self.policy.output] = mini_batch["actions"]
if self.policy.use_recurrent:
feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
if "vector_obs" in mini_batch:
feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
if self.policy.vis_obs_size > 0:
for i, _ in enumerate(self.policy.visual_in):
feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
if self.policy.use_recurrent:
feed_dict[self.policy.memory_in] = [
mini_batch["memory"][i]
for i in range(
0, len(mini_batch["memory"]), self.policy.sequence_length
)
]
feed_dict[self.memory_in] = self._make_zero_mem(
self.m_size, mini_batch.num_experiences
)
return feed_dict
def _create_cc_critic_old(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Continuous control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
hidden_stream = ModelUtils.create_observation_streams(
self.policy.visual_in,
self.policy.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
if self.policy.use_recurrent:
hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
hidden_stream,
self.memory_in,
self.policy.sequence_length_ph,
name="lstm_value",
)
self.memory_out = memory_value_out
else:
hidden_value = hidden_stream
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
self.old_log_probs = tf.reduce_sum(
(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
def _create_dc_critic_old(
self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
"""
Creates Discrete control critic (value) network.
:param h_size: Size of hidden linear layers.
:param num_layers: Number of hidden linear layers.
:param vis_encode_type: The type of visual encoder to use.
"""
hidden_stream = ModelUtils.create_observation_streams(
self.policy.visual_in,
self.policy.processed_vector_in,
1,
h_size,
num_layers,
vis_encode_type,
)[0]
if self.policy.use_recurrent:
hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
hidden_stream,
self.memory_in,
self.policy.sequence_length_ph,
name="lstm_value",
)
self.memory_out = memory_value_out
else:
hidden_value = hidden_stream
self.value_heads, self.value = ModelUtils.create_value_heads(
self.stream_names, hidden_value
)
self.all_old_log_probs = tf.placeholder(
shape=[None, sum(self.policy.act_size)],
dtype=tf.float32,
name="old_probabilities",
)
# Break old log probs into separate branches
old_log_prob_branches = ModelUtils.break_into_branches(
self.all_old_log_probs, self.policy.act_size
)
_, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
old_log_prob_branches, self.policy.action_masks, self.policy.act_size
)
action_idx = [0] + list(np.cumsum(self.policy.act_size))
self.old_log_probs = tf.reduce_sum(
(
tf.stack(
[
-tf.nn.softmax_cross_entropy_with_logits_v2(
labels=self.policy.selected_actions[
:, action_idx[i] : action_idx[i + 1]
],
logits=old_normalized_logits[
:, action_idx[i] : action_idx[i + 1]
],
)
for i in range(len(self.policy.act_size))
],
axis=1,
)
),
axis=1,
keepdims=True,
)
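The update() method above alternates between joint, model-only, and policy-only updates when use_alter is set. A standalone illustration of that schedule, assuming integer division over alter_every is the intended behaviour:

def update_phase(num_updates: int, alter_every: int = 400) -> str:
    """Which update dict the optimizer would run at a given update count."""
    block = num_updates // alter_every
    if block == 0:
        return "all"                           # first block: joint model + policy update
    return "model" if block % 2 == 1 else "policy"

for step in (0, 399, 400, 799, 800):
    print(step, update_phase(step))            # 0/399 -> all, 400/799 -> model, 800 -> policy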

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (350 changes)


# # Unity ML-Agents Toolkit
# ## ML-Agent Learning (PPO)
# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347
from collections import defaultdict
from typing import cast
import numpy as np
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.policy.nn_policy import NNPolicy
from mlagents.trainers.policy.transfer_policy import TransferPolicy
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.ppo_transfer.optimizer import PPOTransferOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, PPOSettings
BUFFER_TRUNCATE_PERCENT = 0.6
logger = get_logger(__name__)
class PPOTransferTrainer(RLTrainer):
"""The PPOTrainer is an implementation of the PPO algorithm."""
def __init__(
self,
brain_name: str,
reward_buff_cap: int,
trainer_settings: TrainerSettings,
training: bool,
load: bool,
seed: int,
artifact_path: str,
):
"""
Responsible for collecting experiences and training PPO model.
:param brain_name: The name of the brain associated with trainer config
:param reward_buff_cap: Max reward history to track in the reward buffer
:param trainer_settings: The parameters for the trainer.
:param training: Whether the trainer is set for training.
:param load: Whether the model should be loaded.
:param seed: The seed the model will be initialized with
:param artifact_path: The directory within which to store artifacts from this trainer.
"""
super(PPOTransferTrainer, self).__init__(
brain_name, trainer_settings, training, artifact_path, reward_buff_cap
)
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
)
self.load = load
self.seed = seed
self.policy: TransferPolicy = None # type: ignore
self.off_policy_buffer: AgentBuffer = AgentBuffer()
self.use_iealter = False
print("The current algorithm is PPO Transfer")
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""
Takes a trajectory and processes it, putting it into the update buffer.
Processing involves calculating value and advantage targets for model updating step.
:param trajectory: The Trajectory tuple containing the steps to be processed.
"""
super()._process_trajectory(trajectory)
agent_id = trajectory.agent_id # All the agents should have the same ID
agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
if self.is_training:
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
# Get all value estimates
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
trajectory.done_reached and not trajectory.interrupted,
)
for name, v in value_estimates.items():
agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
self._stats_reporter.add_stat(
self.optimizer.reward_signals[name].value_name, np.mean(v)
)
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
evaluate_result = reward_signal.evaluate_batch(
agent_buffer_trajectory
).scaled_reward
agent_buffer_trajectory["{}_rewards".format(name)].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
# Compute GAE and returns
tmp_advantages = []
tmp_returns = []
for name in self.optimizer.reward_signals:
bootstrap_value = value_next[name]
local_rewards = agent_buffer_trajectory[
"{}_rewards".format(name)
].get_batch()
local_value_estimates = agent_buffer_trajectory[
"{}_value_estimates".format(name)
].get_batch()
local_advantage = get_gae(
rewards=local_rewards,
value_estimates=local_value_estimates,
value_next=bootstrap_value,
gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.hyperparameters.lambd,
)
local_return = local_advantage + local_value_estimates
# This is later used as a target for the different value estimates
agent_buffer_trajectory["{}_returns".format(name)].set(local_return)
agent_buffer_trajectory["{}_advantage".format(name)].set(local_advantage)
tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)
# Get global advantages
global_advantages = list(
np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
agent_buffer_trajectory["discounted_returns"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length
)
# Also append to the off-policy buffer when use_iealter is enabled
if self.use_iealter:
agent_buffer_trajectory.resequence_and_append(
self.off_policy_buffer, training_length=self.policy.sequence_length
)
# If this was a terminal trajectory, append stats and reset reward collection
if trajectory.done_reached:
self._update_end_episode_stats(agent_id, self.optimizer)
def _is_ready_update(self):
"""
Returns whether or not the trainer has enough elements to run update model
:return: A boolean corresponding to whether or not update_model() can be run
"""
if self.use_iealter:
size_of_buffer = self.off_policy_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
else:
size_of_buffer = self.update_buffer.num_experiences
return size_of_buffer > self.hyperparameters.buffer_size
def _update_policy(self):
"""
Uses the update buffer to update the policy.
The reward signal generators must be updated in this method at their own pace.
"""
if self.use_iealter:
self._update_model()
if self.update_buffer.num_experiences < self.hyperparameters.buffer_size:
return True
buffer_length = self.update_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
return True
def _update_model(self):
"""
Uses the off-policy buffer to update the model component of the network.
The reward signal generators must be updated in this method at their own pace.
"""
buffer_length = self.off_policy_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()
# Make sure batch_size is a multiple of sequence length. During training, we
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
)
advantages = self.off_policy_buffer["advantages"].get_batch()
self.off_policy_buffer["advantages"].set(
(advantages - advantages.mean()) / (advantages.std() + 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.off_policy_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
# self.off_policy_buffer.reset_agent()
if self.off_policy_buffer.num_experiences > self.hyperparameters.buffer_size:
self.off_policy_buffer.truncate(
int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
)
return True
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters
) -> TFPolicy:
"""
Creates a TransferPolicy for this trainer.
:param parsed_behavior_id: Behavior identifiers for the policy to be created.
:param brain_parameters: specifications for policy construction
:return policy
"""
policy = TransferPolicy(
self.seed,
brain_parameters,
self.trainer_settings,
self.is_training,
self.artifact_path,
self.load,
condition_sigma_on_obs=False, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
)
return policy
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
"""
Adds policy to trainer.
:param parsed_behavior_id: Behavior identifiers that the policy should belong to.
:param policy: Policy to associate with name_behavior_id.
"""
if self.policy:
logger.warning(
"Your environment contains multiple teams, but {} doesn't support adversarial games. Enable self-play to \
train adversarial games.".format(
self.__class__.__name__
)
)
if not isinstance(policy, TransferPolicy):
raise RuntimeError("Non-TransferPolicy passed to PPOTransferTrainer.add_policy()")
self.policy = policy
self.optimizer = PPOTransferOptimizer(self.policy, self.trainer_settings)
for _reward_signal in self.optimizer.reward_signals.keys():
self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
# Needed to resume loads properly
self.step = policy.get_current_step()
def get_policy(self, name_behavior_id: str) -> TFPolicy:
"""
Gets policy from trainer associated with name_behavior_id
:param name_behavior_id: full identifier of policy
"""
return self.policy
def discount_rewards(r, gamma=0.99, value_next=0.0):
"""
Computes discounted sum of future rewards for use in updating value estimate.
:param r: List of rewards.
:param gamma: Discount factor.
:param value_next: T+1 value estimate for returns calculation.
:return: discounted sum of future rewards as list.
"""
discounted_r = np.zeros_like(r)
running_add = value_next
for t in reversed(range(0, r.size)):
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
:param value_next: Value estimate for time-step T+1.
:param value_estimates: list of value estimates for time-steps t to T.
:param gamma: Discount factor.
:param lambd: GAE weighting factor.
:return: list of advantage estimates for time-steps t to T.
"""
value_estimates = np.append(value_estimates, value_next)
delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage
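For reference, a small self-contained check of the GAE recurrence implemented by discount_rewards and get_gae above: TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) are discounted right-to-left by gamma * lambd, and returns are advantages plus value estimates. The numbers below are arbitrary and only illustrate the computation:

import numpy as np

rewards = np.array([1.0, 0.0, 0.5], dtype=np.float32)
values = np.array([0.9, 0.8, 0.7], dtype=np.float32)
value_next = 0.6          # bootstrap estimate for the step after the trajectory
gamma, lambd = 0.99, 0.95

values_ext = np.append(values, value_next)
deltas = rewards + gamma * values_ext[1:] - values_ext[:-1]   # TD residuals

advantages = np.zeros_like(deltas)
running = 0.0
for t in reversed(range(len(deltas))):
    running = running * gamma * lambd + deltas[t]
    advantages[t] = running

returns = advantages + values   # corresponds to the "{name}_returns" targets above
print(advantages)
print(returns)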

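The off-policy buffer in _update_model above is bounded in a similar spirit: once it holds more than buffer_size experiences, it is truncated to buffer_size * BUFFER_TRUNCATE_PERCENT entries. A toy sketch with a plain list standing in for AgentBuffer, assuming truncation keeps the most recent experiences:

BUFFER_TRUNCATE_PERCENT = 0.6
buffer_size = 10

experiences = list(range(14))          # pretend 14 experiences have accumulated
if len(experiences) > buffer_size:
    keep = int(buffer_size * BUFFER_TRUNCATE_PERCENT)
    experiences = experiences[-keep:]  # drop the oldest, keep the newest entries

print(len(experiences))                # -> 6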
14
Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial


%YAML 1.1
%TAG !u! tag:unity3d.com,2011:
--- !u!134 &13400000
PhysicMaterial:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_Name: Floor
dynamicFriction: 0.6
staticFriction: 0.6
bounciness: 0
frictionCombine: 0
bounceCombine: 0

8
Project/Assets/ML-Agents/Examples/Crawler/Physics_materials/Floor.physicMaterial.meta


fileFormatVersion: 2
guid: dff6e5680d76643a481e8d81555ef3ee
NativeFormatImporter:
externalObjects: {}
mainObjectFileID: 13400000
userData:
assetBundleName:
assetBundleVariant: