浏览代码

ppo new implementation

/develop/bisim-sac-transfer
yanchaosun 5 年前
当前提交
e8fcc4bb
共有 8 个文件被更改，包括 60 次插入和 58 次删除
  1. 1
      config/ppo_transfer/3DBall.yaml
  2. 1
      config/ppo_transfer/3DBallHard.yaml
  3. 6
      config/ppo_transfer/3DBallHardTransfer.yaml
  4. 4
      config/sac/3DBallHard.yaml
  5. 2
      config/sac_transfer/3DBall.yaml
  6. 2
      config/sac_transfer/3DBallHard.yaml
  7. 4
      config/sac_transfer/3DBallHardTransfer.yaml
  8. 98
      ml-agents/mlagents/trainers/ppo_transfer/optimizer.py

1
config/ppo_transfer/3DBall.yaml


num_epoch: 3
learning_rate_schedule: linear
model_schedule: constant
separate_model_train: true
encoder_layers: 1
policy_layers: 1
forward_layers: 1

1
config/ppo_transfer/3DBallHard.yaml


num_epoch: 3
learning_rate_schedule: linear
model_schedule: constant
separate_model_train: true
encoder_layers: 1
policy_layers: 1
forward_layers: 1

6
config/ppo_transfer/3DBallHardTransfer.yaml


lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
model_schedule: constant
model_schedule: linear
encoder_layers: 1
policy_layers: 1
forward_layers: 1

in_epoch_alter: false
in_batch_alter: true
in_batch_alter: false
use_op_buffer: false
use_var_predict: true
with_prior: false

use_transfer: true
transfer_path: "results/ball-targv/3DBall"
transfer_path: "results/ball/3DBall"
load_model: true
train_model: false
network_settings:

4
config/sac/3DBallHard.yaml


trainer_type: sac
hyperparameters:
learning_rate: 0.0003
learning_rate_schedule: constant
learning_rate_schedule: linear
batch_size: 256
buffer_size: 500000
buffer_init_steps: 0

reward_signal_steps_per_update: 10.0
network_settings:
normalize: true
hidden_units: 64
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:

2
config/sac_transfer/3DBall.yaml


use_bisim: false
network_settings:
normalize: true
hidden_units: 64
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:

2
config/sac_transfer/3DBallHard.yaml


use_bisim: false
network_settings:
normalize: true
hidden_units: 64
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:

4
config/sac_transfer/3DBallHardTransfer.yaml


use_transfer: true
load_model: true
train_model: false
transfer_path: "results/ball-linear/3DBall"
transfer_path: "results/ball_f16_linear/3DBall"
hidden_units: 64
hidden_units: 128
num_layers: 2
vis_encode_type: simple
reward_signals:

98
ml-agents/mlagents/trainers/ppo_transfer/optimizer.py


(tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
)
target_hidden_value = ModelUtils.create_vector_observation_encoder(
self.policy.targ_encoder,
h_size,
ModelUtils.swish,
num_layers,
scope=f"main_graph",
reuse=True,
)
self.target_value_heads, self.target_value = ModelUtils.create_value_heads(
self.stream_names, target_hidden_value, reuse=True
)
# target_hidden_value = ModelUtils.create_vector_observation_encoder(
# self.policy.targ_encoder,
# h_size,
# ModelUtils.swish,
# num_layers,
# scope=f"main_graph",
# reuse=True,
# )
# self.target_value_heads, self.target_value = ModelUtils.create_value_heads(
# self.stream_names, target_hidden_value, reuse=True
# )
def _create_dc_critic(

keepdims=True,
)
def _get_value_estimates(
self,
next_obs: List[np.ndarray],
done: bool,
policy_memory: np.ndarray = None,
value_memory: np.ndarray = None,
prev_action: np.ndarray = None,
) -> Dict[str, float]:
"""
Generates value estimates for bootstrapping.
:param experience: AgentExperience to be used for bootstrapping.
:param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
:return: The value estimate dictionary with key being the name of the reward signal and the value the
corresponding value estimate.
"""
# def _get_value_estimates(
# self,
# next_obs: List[np.ndarray],
# done: bool,
# policy_memory: np.ndarray = None,
# value_memory: np.ndarray = None,
# prev_action: np.ndarray = None,
# ) -> Dict[str, float]:
# """
# Generates value estimates for bootstrapping.
# :param experience: AgentExperience to be used for bootstrapping.
# :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
# :return: The value estimate dictionary with key being the name of the reward signal and the value the
# corresponding value estimate.
# """
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: 1,
self.policy.sequence_length_ph: 1,
}
vec_vis_obs = SplitObservations.from_observations(next_obs)
for i in range(len(vec_vis_obs.visual_observations)):
feed_dict[self.policy.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
# feed_dict: Dict[tf.Tensor, Any] = {
# self.policy.batch_size_ph: 1,
# self.policy.sequence_length_ph: 1,
# }
# vec_vis_obs = SplitObservations.from_observations(next_obs)
# for i in range(len(vec_vis_obs.visual_observations)):
# feed_dict[self.policy.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
if self.policy.vec_obs_size > 0:
feed_dict[self.policy.vector_in] = [vec_vis_obs.vector_observations]
if policy_memory is not None:
feed_dict[self.policy.memory_in] = policy_memory
if value_memory is not None:
feed_dict[self.memory_in] = value_memory
if prev_action is not None:
feed_dict[self.policy.prev_action] = [prev_action]
value_estimates = self.sess.run(self.target_value_heads, feed_dict)
# if self.policy.vec_obs_size > 0:
# feed_dict[self.policy.vector_in] = [vec_vis_obs.vector_observations]
# if policy_memory is not None:
# feed_dict[self.policy.memory_in] = policy_memory
# if value_memory is not None:
# feed_dict[self.memory_in] = value_memory
# if prev_action is not None:
# feed_dict[self.policy.prev_action] = [prev_action]
# value_estimates = self.sess.run(self.target_value_heads, feed_dict)
value_estimates = {k: float(v) for k, v in value_estimates.items()}
# value_estimates = {k: float(v) for k, v in value_estimates.items()}
# If we're done, reassign all of the value estimates that need terminal states.
if done:
for k in value_estimates:
if self.reward_signals[k].use_terminal_states:
value_estimates[k] = 0.0
# # If we're done, reassign all of the value estimates that need terminal states.
# if done:
# for k in value_estimates:
# if self.reward_signals[k].use_terminal_states:
# value_estimates[k] = 0.0
return value_estimates
# return value_estimates
正在加载...
取消
保存