
bisimulation draft

/develop/bisim-review
yanchaosun, 4 years ago
Current commit: 3d0d359c
9 files changed, with 558 insertions and 19 deletions
  1. config/ppo_transfer/CrawlerStaticOpbuffer.yaml (7 changed lines)
  2. ml-agents/mlagents/trainers/policy/transfer_policy.py (6 changed lines)
  3. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (22 changed lines)
  4. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (18 changed lines)
  5. ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (412 changed lines)
  6. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (25 changed lines)
  7. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (8 changed lines)
  8. config/ppo_transfer/CrawlerStaticP0.yaml (37 changed lines)
  9. config/ppo_transfer/TransferCrawlerStaticP0.yaml (42 changed lines)

config/ppo_transfer/CrawlerStaticOpbuffer.yaml (7 changed lines)


      value_layers: 3
      forward_layers: 2
      inverse_layers: 1
      feature_size: 32
      feature_size: 64
      use_bisim: true
      separate_value_net: true
      use_var_predict: true
      with_prior: true
      predict_return: true
    network_settings:
      normalize: true
      hidden_units: 512
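Reviewer note on this hunk: the two feature_size lines are the old and new values from the diff (32 changed to 64), not a duplicated key in the shipped config. If a config file really did repeat a key, PyYAML's default loader would silently keep the last occurrence, so the file would still parse; a toy check (assumes PyYAML is available, not part of this change):

```python
import yaml

# Duplicate keys are not an error for PyYAML's safe loader:
# the last occurrence wins, so 64 would be the effective value.
doc = yaml.safe_load("feature_size: 32\nfeature_size: 64\n")
assert doc["feature_size"] == 64
```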

ml-agents/mlagents/trainers/policy/transfer_policy.py (6 changed lines)


        kl = 0.5 * tf.reduce_sum(tf.square(self.mu) + tf.square(self.sigma) - 2 * self.log_sigma - 1, 1)
        return kl

    def w_distance(self, another):
        return tf.squared_difference(self.mu, another.mu) + tf.squared_difference(self.sigma, another.sigma)


class TransferPolicy(TFPolicy):
    def __init__(

        if var_predict:
            self.bisim_predict_distribution = GaussianEncoderDistribution(
                hidden,
                self.feature_size
                self.feature_size,
                reuse=True
            )
            self.bisim_predict = self.predict_distribution.sample()
        else:
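Reviewer note: the kl expression is the closed form of KL(N(mu, diag(sigma^2)) || N(0, I)) summed over feature dimensions, and w_distance looks like the per-dimension squared 2-Wasserstein distance between two diagonal Gaussians. A toy NumPy sketch of both quantities, with helper names of my own (not the repo's API):

```python
import numpy as np

def kl_to_standard_normal(mu, sigma):
    # 0.5 * sum(mu^2 + sigma^2 - 2*log(sigma) - 1), matching the kl line above
    return 0.5 * np.sum(mu ** 2 + sigma ** 2 - 2.0 * np.log(sigma) - 1.0, axis=-1)

def w2_squared_per_dim(mu1, sigma1, mu2, sigma2):
    # (mu1 - mu2)^2 + (sigma1 - sigma2)^2 before any reduction, as in w_distance
    return (mu1 - mu2) ** 2 + (sigma1 - sigma2) ** 2

mu, sigma = np.zeros(4), np.ones(4)
assert np.isclose(kl_to_standard_normal(mu, sigma), 0.0)  # standard normal vs itself
print(w2_squared_per_dim(mu, sigma, mu + 1.0, sigma * 2.0))  # [2. 2. 2. 2.]
```

One thing to confirm: bisim_predict is sampled from self.predict_distribution rather than the newly built self.bisim_predict_distribution; if that is not deliberate reuse of the same predictor, it may be a typo.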

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (22 changed lines)


        reward_diff = tf.reduce_mean(
            tf.squared_difference(self.policy.bisim_pred_reward, self.policy.pred_reward)
        )
        predict_diff = self.reward_signals["extrinsic_value"].gamma * predict_diff + reward_diff
        predict_diff = self.reward_signals["extrinsic"].gamma * predict_diff + reward_diff
        encode_dist = tf.reduce_mean(
            tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
        )

        train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
        if self.train_model:
            train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
            train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse")
            train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
        if self.train_policy:
            train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
        if self.train_value:

        train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
        if self.train_model:
            train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
            train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
        policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
            + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
        model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
            + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "reward")
        self.model_optimizer = self.create_optimizer_op(self.model_learning_rate)
        self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
        self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)

stats_needed = {
"Losses/Bisim Loss": "bisim_loss",
"Policy/Bisim Learning Rate": "bisim learning_rate",
"Policy/Bisim Learning Rate": "bisim_learning_rate",
self.policy.vector_in: mini_batch1["vector_in"],
self.policy.vector_bisim: mini_batch2["vector_in"],
self.policy.vector_in: mini_batch1["vector_obs"],
self.policy.vector_bisim: mini_batch2["vector_obs"],
# print("batch 1", mini_batch1["vector_obs"])
# print("batch 2", mini_batch2["vector_obs"])
# print("batch 3", mini_batch3["vector_obs"])
return update_stats
def _construct_feed_dict(
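Reviewer note: taken together, reward_diff, predict_diff, and encode_dist look like a DBC-style bisimulation objective, where the distance between two encoded states is driven toward the reward difference plus gamma times the difference of the predicted next states; the hunk also appears to fix the reward-signal key ("extrinsic_value" to "extrinsic"), the stats key ("bisim learning_rate" to "bisim_learning_rate"), and the feed-dict keys ("vector_in" to "vector_obs"). Two things worth double-checking: policy_train_vars adds the "policy" collection to itself twice, and model_grads / model_update_batch are built from train_vars rather than the freshly defined model_train_vars. A minimal NumPy sketch of the assumed loss shape (the names and the final squared penalty are my assumptions, not the repo's code):

```python
import numpy as np

def bisim_loss(enc1, enc2, reward1, reward2, next1, next2, gamma=0.995):
    reward_diff = np.mean((reward1 - reward2) ** 2)
    predict_diff = np.mean((next1 - next2) ** 2)   # stand-in for the transition-distance term
    target = gamma * predict_diff + reward_diff    # mirrors: predict_diff = gamma * predict_diff + reward_diff
    encode_dist = np.mean((enc1 - enc2) ** 2)      # distance between the two encodings
    return (encode_dist - target) ** 2             # penalty pulling latent distance toward the target

rng = np.random.default_rng(0)
e1, e2 = rng.normal(size=8), rng.normal(size=8)
print(bisim_loss(e1, e2, 1.0, 0.5, e1, e2))
```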

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (18 changed lines)


from typing import cast
import numpy as np
import copy
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.policy.nn_policy import NNPolicy

self.use_iealter = self.hyperparameters.in_epoch_alter
self.use_op_buffer = self.hyperparameters.use_op_buffer
self.conv_thres = self.hyperparameters.conv_thres
self.use_bisim = self.hyperparameters.use_bisim
self.num_check = 0
self.train_model = True
self.old_loss = np.inf

)
for stat_name, value in update_stats.items():
    batch_update_stats[stat_name].append(value)

if self.use_bisim:
    self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
    buffer1 = copy.deepcopy(self.update_buffer)
    self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
    buffer2 = copy.deepcopy(self.update_buffer)
    self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
    buffer3 = copy.deepcopy(self.update_buffer)
    max_num_batch = buffer_length // batch_size
    for i in range(0, max_num_batch * batch_size, batch_size):
        update_stats = self.optimizer.update_encoder(
            buffer1.make_mini_batch(i, i + batch_size),
            buffer2.make_mini_batch(i, i + batch_size),
            buffer3.make_mini_batch(i, i + batch_size),
        )
        for stat_name, value in update_stats.items():
            batch_update_stats[stat_name].append(value)
else:
    self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
    buffer = self.update_buffer
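Reviewer note: buffer1, buffer2, and buffer3 are three independently shuffled deep copies of the same update buffer, so the i-th mini-batch of each copy pairs up random transitions for the bisimulation update. A toy sketch of the idea, with plain lists standing in for the AgentBuffer:

```python
import copy
import random

random.seed(0)
buffer = list(range(12))
views = []
for _ in range(3):
    random.shuffle(buffer)               # re-shuffle in place
    views.append(copy.deepcopy(buffer))  # snapshot the current permutation

batch_size = 4
max_num_batch = len(buffer) // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
    batch1, batch2, batch3 = (v[i:i + batch_size] for v in views)
    # optimizer.update_encoder(batch1, batch2, batch3) runs here in the trainer
```

Since a fresh copy is taken after every shuffle, the pairing changes each epoch; sampling index permutations instead of deep-copying the whole buffer would achieve the same pairing more cheaply.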

ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (412 changed lines)
Diff not shown: the file is too large to display.

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (25 changed lines)


tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
network_settings=NetworkSettings(num_layers=2, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,

# assert all(reward > success_threshold for reward in processed_rewards)
def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="model_rich2_f4", seed=0):
def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="model_rich2_f4_pv-l0", seed=0):
config.hyperparameters, batch_size=120, buffer_size=12000, learning_rate=5.0e-3
config.hyperparameters, batch_size=120, buffer_size=12000, learning_rate=5.0e-3,
# use_bisim=False, predict_return=True, separate_value_train=True, separate_policy_train=True,
# use_var_predict=True, with_prior=True, use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False,
# policy_layers=0, value_layers=0, encoder_layers=2, feature_size=4,
#use_inverse_model=True
def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="transfer_rich2_from-normal", seed=1337):
def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="transfer_f4_rich2_from-rich1-retrain-pv", seed=1337):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"

transfer_path="./transfer_results/model_normal_f4_s0/Simple",
use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False, learning_rate=5e-4,
train_policy=False, train_value=False, train_model=False, feature_size=4
transfer_path="./transfer_results/model_rich1_f4_pv-l0_s0/Simple",
use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False, learning_rate=5.0e-3,
train_policy=True, train_value=True, train_model=False, feature_size=4,
use_var_predict=True, with_prior=True, policy_layers=0, load_policy=False,
load_value=False,
value_layers=0, encoder_layers=2,
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)

# test_2d_model(seed=0)
# test_2d_model(config=PPO_CONFIG, run_id="ppo_rich2", seed=0)
test_2d_transfer(seed=0)
test_2d_model(config=SAC_CONFIG, run_id="sac_rich2_hard", seed=0)
# test_2d_transfer(seed=0)
# for i in range(5):
# test_2d_model(seed=i)
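Reviewer note: the test overrides hyperparameters through attr.evolve, which copies a frozen attrs settings object with selected fields replaced. A self-contained illustration of that pattern with a made-up settings class (not the ml-agents one):

```python
import attr

@attr.s(auto_attribs=True, frozen=True)
class ToyHyperparameters:
    batch_size: int = 32
    learning_rate: float = 3.0e-4
    feature_size: int = 16

base = ToyHyperparameters()
tuned = attr.evolve(base, batch_size=120, learning_rate=5.0e-3, feature_size=4)
assert base.batch_size == 32 and tuned.batch_size == 120  # the base config is untouched
```

Also note that test_2d_transfer loads from a local ./transfer_results/ directory, so the corresponding model run must already exist before the uncommented calls at the bottom of the file are executed.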

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (8 changed lines)


        for name in self.names:
            self.agent_id[name] = 0
            if self.goal_type == "easy":
                self.goal[name] = self.random.choice([-1, 1])
                self.goal[name] = []
                for _ in range(self.num_vector):
                    self.goal[name].append(self.random.choice([-1, 1]))
            elif self.goal_type == "hard":
                self.goal[name] = []
                for _ in range(self.num_vector):

    def _reset_agent(self, name):
        if self.goal_type == "easy":
            self.goal[name] = self.random.choice([-1, 1])
            self.goal[name] = []
            for _ in range(self.num_vector):
                self.goal[name].append(self.random.choice([-1, 1]))
        elif self.goal_type == "hard":
            self.goal[name] = []
            for _ in range(self.num_vector):
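Reviewer note: the "easy" goal is now a list with one ±1 entry per observed vector, matching the shape of the "hard" goals instead of a single scalar. A toy sketch of the new sampling:

```python
import random

random.seed(0)
num_vector = 2
goal = [random.choice([-1, 1]) for _ in range(num_vector)]
print(goal)  # a list like [1, -1], one entry per vector observation
```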

config/ppo_transfer/CrawlerStaticP0.yaml (37 changed lines)


behaviors:
  CrawlerStatic:
    trainer_type: ppo_transfer
    hyperparameters:
      batch_size: 2024
      buffer_size: 20240
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: linear
      encoder_layers: 3
      policy_layers: 0
      forward_layers: 2
      value_layers: 3
      feature_size: 32
      reuse_encoder: true
      separate_value_train: true
      in_epoch_alter: true
      use_op_buffer: true
      use_var_predict: true
      with_prior: true
    network_settings:
      normalize: true
      hidden_units: 512
      num_layers: 3
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.995
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 10000000
    time_horizon: 1000
    summary_freq: 30000
    threaded: true

config/ppo_transfer/TransferCrawlerStaticP0.yaml (42 changed lines)


behaviors:
  CrawlerStatic:
    trainer_type: ppo_transfer
    hyperparameters:
      batch_size: 2024
      buffer_size: 20240
      learning_rate: 0.0003
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: linear
      encoder_layers: 3
      policy_layers: 0
      forward_layers: 2
      value_layers: 3
      feature_size: 32
      reuse_encoder: true
      separate_value_train: true
      in_epoch_alter: true
      use_op_buffer: true
      use_var_predict: true
      with_prior: true
      use_transfer: true
      load_policy: false
      load_value: false
      transfer_path: "results/cs-p0/CrawlerStatic"
    network_settings:
      normalize: true
      hidden_units: 512
      num_layers: 3
      vis_encode_type: simple
    reward_signals:
      extrinsic:
        gamma: 0.995
        strength: 1.0
    keep_checkpoints: 5
    max_steps: 10000000
    time_horizon: 1000
    summary_freq: 30000
    threaded: true
    transfer: true
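Reviewer note: transfer_path: "results/cs-p0/CrawlerStatic" suggests this config expects a base run of CrawlerStaticP0.yaml to be trained first (apparently with run id cs-p0), whose saved model is then reused while load_policy and load_value stay false. A small sanity-check sketch (assumes PyYAML and that it is run from the repository root; not part of this change):

```python
import os
import yaml

with open("config/ppo_transfer/TransferCrawlerStaticP0.yaml") as f:
    cfg = yaml.safe_load(f)

hp = cfg["behaviors"]["CrawlerStatic"]["hyperparameters"]
assert hp["use_transfer"] is True
if not os.path.isdir(hp["transfer_path"]):
    print(f"warning: {hp['transfer_path']} not found; train CrawlerStaticP0.yaml first")
```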