
transfer config

/develop/bisim-review
yanchaosun, 4 years ago
Current commit 59e93b0b
7 files changed: 33 insertions, 18 deletions
  1. config/ppo_transfer/CrawlerStatic.yaml (5 changes)
  2. config/ppo_transfer/CrawlerStaticOpbuffer.yaml (5 changes)
  3. config/ppo_transfer/CrawlerStaticVar.yaml (2 changes)
  4. config/ppo_transfer/OldCrawlerStatic.yaml (7 changes)
  5. ml-agents/mlagents/trainers/learn.py (6 changes)
  6. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (20 changes)
  7. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (6 changes)

config/ppo_transfer/CrawlerStatic.yaml (5 changes)


value_layers: 2
feature_size: 32
reuse_encoder: true
use_var_encoder: true
use_var_predict: true
in_epoch_alter: true
use_inverse_model: true
use_op_buffer: true
network_settings:
normalize: true
hidden_units: 512

config/ppo_transfer/CrawlerStaticOpbuffer.yaml (5 changes)


inverse_layers: 1
feature_size: 32
reuse_encoder: true
use_var_predict: true
in_epoch_alter: true
in_batch_alter: true
in_batch_alter: true
use_var_predict: true
network_settings:
normalize: true
hidden_units: 512

config/ppo_transfer/CrawlerStaticVar.yaml (2 changes)


behaviors:
OldCrawlerStatic:
CrawlerStatic:
trainer_type: ppo_transfer
hyperparameters:
batch_size: 2024

config/ppo_transfer/OldCrawlerStatic.yaml (7 changes)


value_layers: 2
feature_size: 32
reuse_encoder: true
in_batch_alter: true
use_var_encoder: true
use_var_predict: true
in_epoch_alter: true
in_batch_alter: true
use_op_buffer: true
network_settings:
normalize: true
hidden_units: 512

time_horizon: 1000
summary_freq: 30000
threaded: true
transfer: true
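The flags in these configs (feature_size, use_var_predict, in_epoch_alter, use_op_buffer, and the transfer flag added here) are behavior settings for the ppo_transfer trainer, and learn.py below reads the transfer flag as options.behaviors[...].transfer. A minimal sketch of inspecting such a file with plain PyYAML, assuming the nesting shown in the CrawlerStaticVar.yaml hunk (behaviors -> behavior name -> trainer_type/hyperparameters) rather than the ml-agents settings loader:

# Sketch only: raw YAML view of a ppo_transfer behavior config.
# The ml-agents trainers parse this into typed settings objects;
# the file path and key placement here are assumptions for illustration.
import yaml

with open("config/ppo_transfer/OldCrawlerStatic.yaml") as f:  # example path
    config = yaml.safe_load(f)

for name, behavior in config["behaviors"].items():
    print(name, behavior.get("trainer_type"))
    print("  transfer:", behavior.get("transfer", False))
    print("  hyperparameters:", behavior.get("hyperparameters", {}))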

ml-agents/mlagents/trainers/learn.py (6 changes)


run_seed = np.random.randint(0, 10000)
run_training(run_seed, options)
# if options.behaviors["3DBall"].transfer:
# os.system('export SCENE_NAME=3dballhard')
# os.system('mlagents-learn config/ppo_transfer/3DBallHard.yaml --run-id=hardball-transfer --env=/unity-volume/executable --num-envs=4 --force')
if options.behaviors["CrawlerStatic"].transfer:
os.system('export SCENE_NAME=crawlerstatictarget')
os.system('mlagents-learn config/ppo_transfer/CrawlerStatic.yaml --run-id=cs-transfer --env=/unity-volume/executable --num-envs=4 --force')
# os.system('mlagents-learn config/ppo_transfer/3DBallHard.yaml --run-id=hardball-transfer --env=/unity-volume/3dballhard --num-envs=4 --force')
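One note on the hunk above: each os.system() call spawns its own shell, so the export SCENE_NAME=... does not carry over to the following mlagents-learn call. A sketch (not part of this commit) of passing the variable through the child process environment instead, reusing the exact flags from the command above:

# Sketch only: set SCENE_NAME in this process's environment (inherited by
# children), or pass an explicit env dict, so mlagents-learn and the Unity
# executable it launches can actually see it.
import os
import subprocess

env = dict(os.environ, SCENE_NAME="crawlerstatictarget")
subprocess.run(
    [
        "mlagents-learn",
        "config/ppo_transfer/CrawlerStatic.yaml",
        "--run-id=cs-transfer",
        "--env=/unity-volume/executable",
        "--num-envs=4",
        "--force",
    ],
    env=env,
    check=True,
)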

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (20 changes)


import os
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.policy.transfer_policy import TransferPolicy

"Losses/Policy Loss": "policy_loss",
"Losses/Model Loss": "model_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Model Learning Rate": "model_learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}

with tf.variable_scope("optimizer/"):
self.learning_rate = ModelUtils.create_schedule(
self._schedule,
lr,
self.policy.global_step,
int(max_step),
min_value=1e-10,
)
self.model_learning_rate = ModelUtils.create_schedule(
ScheduleType.LINEAR,
lr,
self.policy.global_step,
int(max_step),

"learning_rate": self.learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
"model_learning_rate": self.model_learning_rate
}
)
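For reference, a rough plain-Python sketch of what the two schedules above should produce, assuming ScheduleType.LINEAR interpolates the rate from its initial value down to min_value as global_step approaches max_step. This mirrors the TF schedule created with ModelUtils.create_schedule; it is not the library code:

# Assumed linear decay, clipped at min_value; the resulting values are what
# get reported as "learning_rate" and the new "model_learning_rate" above.
def linear_schedule(initial: float, step: int, max_step: int, min_value: float = 1e-10) -> float:
    frac = min(max(step / max_step, 0.0), 1.0)   # progress through training
    return max(initial * (1.0 - frac), min_value)

# e.g. lr = 3e-4 over 1e6 steps: roughly 1.5e-4 halfway through training
print(linear_schedule(3e-4, 500_000, 1_000_000))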

+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_optimizer = self.create_optimizer_op(self.model_learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
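The hunk above gathers the trainable variables from the predict/inverse/value scopes and builds a separate optimizer step for the model loss restricted to those variables. An illustrative TF1-style sketch of that var_list pattern, with toy variables and a toy loss rather than the ml-agents graph:

# Sketch: restrict an optimizer to variables under chosen scopes via var_list,
# so only those parameters are updated when the model loss is minimized.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.variable_scope("predict"):
    w_predict = tf.get_variable("w", shape=[4, 4])
with tf.variable_scope("value"):
    w_value = tf.get_variable("w", shape=[4, 1])

model_loss = tf.reduce_sum(tf.square(w_predict)) + tf.reduce_sum(tf.square(w_value))

train_vars = (
    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
    + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
model_grads = optimizer.compute_gradients(model_loss, var_list=train_vars)  # (grad, var) pairs
model_update = optimizer.minimize(model_loss, var_list=train_vars)          # apply one step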

{
"model_loss": self.model_loss,
"update_batch": self.model_update_batch,
"learning_rate": self.learning_rate,
"model_learning_rate": self.model_learning_rate,
"decay_epsilon": self.decay_epsilon,
"decay_beta": self.decay_beta,
}

if update_type == "model":
stats_needed = {
"Losses/Model Loss": "model_loss",
"Policy/Learning Rate": "learning_rate",
"Policy/Learning Rate": "model_learning_rate",
"Policy/Epsilon": "decay_epsilon",
"Policy/Beta": "decay_beta",
}

self.policy.processed_vector_next: mini_batch["next_vector_in"],
# self.policy.next_vector_in: mini_batch["next_vector_in"],
self.policy.current_action: mini_batch["actions"],
self.policy.current_reward: mini_batch["extrinsic_rewards"],
self.policy.current_reward: mini_batch["discounted_returns"],
# self.dis_returns: mini_batch["discounted_returns"]
}
for name in self.reward_signals:

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (6 changes)


buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)

num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
# self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.off_policy_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
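Both loops above (over the update buffer and over the shuffled off_policy_buffer) slice the buffer into buffer_length // batch_size mini-batches, collect per-batch stats, and the trainer then typically averages them per update. A minimal stand-alone sketch of that accumulate-and-average pattern, with a stand-in update function instead of the optimizer and buffer classes:

# Sketch only: iterate full mini-batches, accumulate stats, average at the end.
from collections import defaultdict
from statistics import mean

def run_epochs(buffer_length, batch_size, num_epoch, update_fn):
    batch_update_stats = defaultdict(list)
    max_num_batch = buffer_length // batch_size           # drop the ragged tail
    for _ in range(num_epoch):
        for start in range(0, max_num_batch * batch_size, batch_size):
            stats = update_fn(start, start + batch_size)  # e.g. {"model_loss": ...}
            for stat_name, value in stats.items():
                batch_update_stats[stat_name].append(value)
    # average each statistic over all mini-batches and epochs
    return {name: mean(values) for name, values in batch_update_stats.items()}

# toy usage: a fake update that just reports the size of the window it was given
print(run_epochs(10, 3, 2, lambda lo, hi: {"model_loss": hi - lo}))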
