
Commit: aca8cd58 ("update with new alternating")
Branch: /develop/bisim-review
Author: yanchaosun, 4 years ago
4 files changed, 51 insertions(+), 34 deletions(-)
  1. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (20 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (32 changes)
  3. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (31 changes)
  4. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (2 changes)

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (20 changes)


from typing import Optional, Any, Dict, cast
import numpy as np
import os
import copy
from mlagents.tf_utils import tf
from mlagents_envs.timers import timed
from mlagents.trainers.models import ModelUtils, EncoderType, ScheduleType

hyperparameters: PPOTransferSettings = cast(
PPOTransferSettings, trainer_params.hyperparameters
)
self.batch_size = hyperparameters.batch_size
self.separate_value_train = hyperparameters.separate_value_train
self.separate_policy_train = hyperparameters.separate_policy_train

min_value=1e-10,
)
self.model_learning_rate = ModelUtils.create_schedule(
ScheduleType.LINEAR,
# ScheduleType.CONSTANT,
# ScheduleType.LINEAR,
ScheduleType.CONSTANT,
lr,
self.policy.global_step,
int(max_step),

ScheduleType.LINEAR,
ScheduleType.CONSTANT,
lr/10,
self.policy.global_step,
int(max_step),
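
The two create_schedule calls above switch the model learning rate and the lower (lr/10) rate from a linear decay to a constant schedule, assuming the diff lists removed lines before added ones. A consolidated sketch of how the calls read after the change; the attribute name for the lr/10 schedule and the min_value keyword on these two calls are not visible in the hunk and are assumptions carried over by analogy with the preceding schedule:

# Sketch of the schedules after this commit (reconstruction; names outside the hunk are guesses)
self.model_learning_rate = ModelUtils.create_schedule(
    ScheduleType.CONSTANT,   # was ScheduleType.LINEAR before this commit
    lr,
    self.policy.global_step,
    int(max_step),
    min_value=1e-10,
)
# Second schedule at one tenth of the base rate; "bisim_learning_rate" is a hypothetical name
self.bisim_learning_rate = ModelUtils.create_schedule(
    ScheduleType.CONSTANT,   # was ScheduleType.LINEAR before this commit
    lr / 10,
    self.policy.global_step,
    int(max_step),
    min_value=1e-10,
)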

print("start update policy", self.num_updates)
elif self.in_batch_alter:
update_vals = self._execute_model(feed_dict, self.model_update_dict)
update_vals.update(self._execute_model(feed_dict, self.ppo_update_dict))
update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
if self.use_bisim:
batch1 = copy.deepcopy(batch)
batch.shuffle(sequence_length=self.batch_size)
batch2 = copy.deepcopy(batch)
bisim_stats = self.update_encoder(batch1, batch2)
update_vals.update(self._execute_model(feed_dict, self.model_update_dict))
elif self.use_transfer and self.smart_transfer:
if self.update_mode == "model":
update_vals = self._execute_model(feed_dict, self.update_dict)

for stat_name, update_name in stats_needed.items():
# if update_name in update_vals.keys():
update_stats[stat_name] = update_vals[update_name]
if self.in_batch_alter and self.use_bisim:
update_stats.update(bisim_stats)
self.num_updates += 1
return update_stats
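
The optimizer hunk above is the core of the "new alternating" scheme: in in_batch_alter mode the PPO update now runs first, the optional bisimulation encoder update is fed two copies of the mini-batch (one taken before and one after an in-place shuffle), and the model update is merged in last. Read as consolidated code, and assuming the removed-then-added ordering of the diff, the new branch looks roughly like the sketch below; nothing outside the hunk is used:

# Sketch of the reordered in_batch_alter update (reading aid, not a verbatim copy of the file)
elif self.in_batch_alter:
    # 1) policy (PPO) update on the current mini-batch
    update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
    # 2) optional bisimulation encoder update on two shuffled views of the same batch
    if self.use_bisim:
        batch1 = copy.deepcopy(batch)
        batch.shuffle(sequence_length=self.batch_size)
        batch2 = copy.deepcopy(batch)
        bisim_stats = self.update_encoder(batch1, batch2)
    # 3) model/dynamics update last, merged into the same stats dict
    update_vals.update(self._execute_model(feed_dict, self.model_update_dict))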

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (32 changes)


self.update_buffer, training_length=self.policy.sequence_length
)
# the off-policy buffer
if self.use_op_buffer:
if self.use_op_buffer and self.train_model:
agent_buffer_trajectory.resequence_and_append(
self.off_policy_buffer, training_length=self.policy.sequence_length
)

for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:
if abs(self.old_loss - np.mean(stat_list)) < 1e-3:
self.train_model = False
else:
self.old_loss = np.mean(stat_list)
# if abs(self.old_loss - np.mean(stat_list)) < 1e-3:
# self.train_model = False
# else:
# self.old_loss = np.mean(stat_list)
# if self.num_update >= 10:
# self.train_model = False
print(stat, np.mean(stat_list))
if self.optimizer.bc_module:

# self.off_policy_buffer.reset_agent()
if self.off_policy_buffer.num_experiences > 10 * self.hyperparameters.buffer_size:
print("truncate")
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
print("truncate")
# self.train_model = False
return True
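
This hunk tightens the off-policy buffer bookkeeping: trajectories are appended only while train_model is still true, model training stops once the "Losses/Model Loss" stat plateaus, and the buffer is shuffled (and presumably trimmed) once it exceeds ten times the configured buffer size. A minimal sketch of those pieces together; the final truncate call is not in the hunk and is an assumption modeled on how ML-Agents trims its regular update buffer:

# Sketch: stop model training once the model loss stops improving
if stat == "Losses/Model Loss":
    if abs(self.old_loss - np.mean(stat_list)) < 1e-3:
        self.train_model = False          # loss change below 1e-3: treat the model as converged
    else:
        self.old_loss = np.mean(stat_list)

# Sketch: keep the off-policy buffer bounded
if self.off_policy_buffer.num_experiences > 10 * self.hyperparameters.buffer_size:
    print("truncate")
    self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
    # Assumed follow-up (not shown in the hunk): drop samples down to the cap
    self.off_policy_buffer.truncate(int(10 * self.hyperparameters.buffer_size))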

The reward signal generators must be updated in this method at their own pace.
"""
if self.train_model and self.use_op_buffer:
self._update_model()
update_buffer_length = self.update_buffer.num_experiences
op_buffer_length = self.off_policy_buffer.num_experiences
self.cumulative_returns_since_policy_update.clear()

for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.off_policy_buffer
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = update_buffer_length // batch_size # update with as much data as the policy has
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(

batch_update_stats[stat_name].append(value)
if self.use_bisim:
# for _ in range(num_epoch):
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.off_policy_buffer)
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer2 = copy.deepcopy(self.off_policy_buffer)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.update_buffer)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer2 = copy.deepcopy(self.update_buffer)
max_num_batch = update_buffer_length // batch_size # update with as much data as the policy has
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_encoder(
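
The second trainer hunk swaps the data source for the per-epoch updates: instead of shuffling the off-policy buffer, the regular update buffer is shuffled twice to produce two independent views for the bisimulation encoder update, and the number of mini-batches is tied to how much data the policy itself collected. The update_encoder call is truncated in the diff, so the sketch below guesses its arguments; the make_mini_batch slices are an assumption by analogy with the PPO update loop:

# Sketch: bisim encoder updates now drawn from the on-policy update buffer
if self.use_bisim:
    self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
    buffer1 = copy.deepcopy(self.update_buffer)
    self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
    buffer2 = copy.deepcopy(self.update_buffer)
    max_num_batch = update_buffer_length // batch_size  # update with as much data as the policy has
    for i in range(0, max_num_batch * batch_size, batch_size):
        # Argument list is a guess; the call is cut off in the hunk
        update_stats = self.optimizer.update_encoder(
            buffer1.make_mini_batch(i, i + batch_size),
            buffer2.make_mini_batch(i, i + batch_size),
        )
        for stat_name, value in update_stats.items():
            batch_update_stats[stat_name].append(value)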

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (31 changes)


config.hyperparameters, batch_size=120, buffer_size=12000, learning_rate=5.0e-3,
use_bisim=True, predict_return=True,
# separate_value_train=True, separate_policy_train=True,
use_var_predict=True, with_prior=True, use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False,
policy_layers=0, value_layers=0, encoder_layers=2, feature_size=4,
use_var_predict=True, with_prior=True, use_op_buffer=False, in_epoch_alter=False, in_batch_alter=True,
policy_layers=1, value_layers=1, encoder_layers=1, feature_size=4,
#use_inverse_model=True
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)

)
new_hyperparams = attr.evolve(
config.hyperparameters, batch_size=120, buffer_size=12000, use_transfer=True,
transfer_path=transfer_from,
use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False, learning_rate=5.0e-3,
transfer_path=transfer_from, separate_policy_train=True,
use_op_buffer=True, in_epoch_alter=False, in_batch_alter=True, learning_rate=5.0e-3,
use_var_predict=True, with_prior=True, policy_layers=1, load_policy=False,
load_value=False, predict_return=True, value_layers=1, encoder_layers=1,
use_var_predict=True, with_prior=True, policy_layers=0, load_policy=False,
load_value=False, predict_return=True, value_layers=0, encoder_layers=2,
use_bisim=True,
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=300000, summary_freq=5000)
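
Because the diff interleaves removed and added keyword arguments, the transfer-test configuration is easier to read consolidated. Assuming the usual removed-then-added ordering of the hunks (consistent with commented run ids such as "_pv-l0_...ibalter_sepp..."), the new transfer config would read roughly as below; this is a reading aid, not a verbatim copy:

# Sketch: consolidated new hyperparameters for the transfer test (reconstruction)
new_hyperparams = attr.evolve(
    config.hyperparameters, batch_size=120, buffer_size=12000, use_transfer=True,
    transfer_path=transfer_from, separate_policy_train=True,
    use_op_buffer=True, in_epoch_alter=False, in_batch_alter=True, learning_rate=5.0e-3,
    use_var_predict=True, with_prior=True, policy_layers=0, load_policy=False,
    load_value=False, predict_return=True, value_layers=0, encoder_layers=2,
    use_bisim=True,
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=300000, summary_freq=5000)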

if __name__ == "__main__":
for obs in ["rich1", "rich2"]: # ["normal", "rich1", "rich2"]:
for obs in ["normal", "rich1", "rich2"]: # ["normal", "rich1", "rich2"]:
+ "_f4_pv-l1_rew_bisim-op_newalter_noreuse-soft0.1")
+ "_f4_pv-l0_rew_bisim_order-ibalter_noreuse-soft0.1_noop_conlr")
# for obs in ["normal", "rich1"]:
# test_2d_transfer(seed=0, obs_spec_type="rich2",
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim-op_samelen_s0/Simple",
# run_id="transfer_rich2_f4_pv-l0_rew_bisim-op_samelen_from_" + obs)
# for obs in ["normal", "rich2"]:
# for obs in ["normal"]:
# test_2d_transfer(seed=0, obs_spec_type="rich1",
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim_order-ibalter_noreuse-soft0.1_nostop-op10_linlr_s0/Simple",
# run_id="transfer_rich1_f4_pv-l0_soft_ibalter_sepp_from_" + obs)
# for obs in ["normal"]:
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim-op_new_s0/Simple",
# run_id="transfer_rich1_f4_pv-l0_rew_bisim-op_new_from" + obs)
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim-nop_newalter_noreuse-soft0.1_s0/Simple",
# run_id="transfer_rich1_retrain-all_f4_pv-l0_rew_bisim-nop_noreuse-soft0.1_from_" + obs)
# for i in range(5):
# test_2d_model(seed=i)

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (2 changes)


# reward += np.exp(-abs(_pos - self.goal[name]))
if done:
reward = SUCCESS_REWARD
reward = TIME_PENALTY #SUCCESS_REWARD
# for _pos in self.positions[name]:
# if self.goal_type == "easy":
# reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
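
The last file's change is a single reward swap: read as old-then-new, the branch that used to assign SUCCESS_REWARD now assigns TIME_PENALTY, while the shaped exp-distance reward stays commented out. A minimal sketch of how that branch reads after the commit; the surrounding step logic is outside the hunk and assumed:

# Sketch: reward assignment in the toy transfer env after this commit
# reward += np.exp(-abs(_pos - self.goal[name]))   # shaped variant, still disabled
if done:
    reward = TIME_PENALTY  # was SUCCESS_REWARD before the change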
