Browse code

some changes

/develop/transfer-bisim
yanchaosun, 4 years ago
Current commit: ce36349b
5 files changed, with 1050 insertions and 66 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (39 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (18 changes)
  3. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (57 changes)
  4. ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (964 changes)
  5. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (38 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (39 changes)


# a stable version, used to compute the value
self.target_encoder = tf.stop_gradient(self.target_encoder)
self._create_hard_copy()
self._create_soft_copy()
if self.inverse_model:
with tf.variable_scope("inverse"):

if predict_return:
with tf.variable_scope("reward"):
- self.create_reward_model(self.encoder, self.next_encoder, forward_layers)
+ self.create_reward_model(self.encoder, self.next_encoder, forward_layers-1)
- self.vis_encode_type, forward_layers, var_predict, predict_return)
+ self.vis_encode_type, forward_layers, forward_layers-1, var_predict, predict_return)
if self.use_continuous_act:
self._create_cc_actor(

def load_graph_partial(self, path: str, transfer_type="dynamics", load_model=True, load_policy=True,
load_value=True):
load_nets = {"dynamics": [],
"observation": ["encoding", "inverse"]}
"observation": ["encoding"]}
if load_model:
load_nets["dynamics"].append("predict")
if self.predict_return:

load_nets["dynamics"].append("value")
if self.inverse_model:
load_nets["dynamics"].append("inverse")
load_nets["observation"].append("inverse")
with self.graph.as_default():
for net in load_nets[transfer_type]:
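
The hunk above is cut off at the restore loop, but the scope-selection logic it shows can be restated on its own. The sketch below keeps only the branches visible in the hunk and omits the value/return handling that is truncated, so it is an illustration rather than the actual load_graph_partial implementation.

def scopes_to_load(transfer_type: str, load_model: bool, inverse_model: bool):
    # Base scopes per transfer type, as in the hunk: "dynamics" starts empty,
    # "observation" always restores the encoder scope.
    load_nets = {"dynamics": [], "observation": ["encoding"]}
    if load_model:
        load_nets["dynamics"].append("predict")
    if inverse_model:
        load_nets["dynamics"].append("inverse")
        load_nets["observation"].append("inverse")
    return load_nets[transfer_type]

# e.g. scopes_to_load("observation", load_model=True, inverse_model=True)
# returns ["encoding", "inverse"]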

latent = latent_distribution.sample()
return latent_distribution, latent
def _create_soft_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_enc')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoding')
with tf.variable_scope('hard_replacement'):
self.soft_replace_op = [tf.assign(t, 0.9*t + 0.1*e) for t, e in zip(t_params, e_params)]
def run_soft_copy(self):
# print("before:")
# self.get_encoder_weights()
self.sess.run(self.soft_replace_op)
def _create_hard_copy(self):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_enc')

- self.target_replace_op = [tf.assign(t, 0.9*t + 0.1*e) for t, e in zip(t_params, e_params)]
+ self.hard_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
- self.sess.run(self.target_replace_op)
+ self.sess.run(self.hard_replace_op)
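
The split between _create_hard_copy and the new _create_soft_copy comes down to two target-network update rules. Below is a framework-agnostic NumPy sketch of those rules, with tau=0.1 matching the 0.9*t + 0.1*e assignments in the diff; it is illustrative only, not the TensorFlow implementation above.

import numpy as np

def hard_copy(target_params, source_params):
    # target <- source, done once (e.g. at initialization).
    return [np.copy(e) for e in source_params]

def soft_copy(target_params, source_params, tau=0.1):
    # target <- (1 - tau) * target + tau * source (Polyak averaging),
    # i.e. the 0.9 * t + 0.1 * e assignment in soft_replace_op.
    return [(1.0 - tau) * t + tau * e for t, e in zip(target_params, source_params)]

# A slowly moving target encoder gives more stable value/prediction targets:
target = [np.zeros((3, 3))]
online = [np.ones((3, 3))]
target = soft_copy(target, online)  # each call moves the target 10% toward the online weights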
def _create_inverse_model(
self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor

self.h_size,
# * (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
- # activation=ModelUtils.swish,
- # kernel_initializer=tf.initializers.variance_scaling(1.0),
+ activation=ModelUtils.swish,
+ kernel_initializer=tf.initializers.variance_scaling(1.0),
)
if var_predict:

for i in range(forward_layers):
hidden = tf.layers.dense(
hidden,
- self.h_size
- * (self.vis_obs_size + int(self.vec_obs_size > 0)),
+ self.h_size,
+ # * (self.vis_obs_size + int(self.vec_obs_size > 0)),
- # activation=ModelUtils.swish,
- # kernel_initializer=tf.initializers.variance_scaling(1.0),
+ activation=ModelUtils.swish,
+ kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.pred_reward = tf.layers.dense(
hidden,

encoder_layers: int,
vis_encode_type: EncoderType,
forward_layers: int,
reward_layers: int,
var_predict: bool,
predict_return: bool
) -> None:

if predict_return:
with tf.variable_scope("reward"):
hidden = combined_input
- for i in range(forward_layers):
+ for i in range(reward_layers):
hidden = tf.layers.dense(
hidden,
self.h_size
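
The hunk is cut off above, but its intent is clear: the new reward_layers argument controls how many dense layers sit in the reward-prediction head, which is otherwise the same dense+swish stack as the forward model. A small NumPy sketch of such a head follows; layer sizes and initialization are assumptions, not the ml-agents code.

import numpy as np

def swish(x):
    return x / (1.0 + np.exp(-x))

def reward_head(combined_input, reward_layers, h_size, seed=0):
    # `reward_layers` dense+swish layers followed by a single scalar output,
    # mirroring the `for i in range(reward_layers)` loop in create_reward_model.
    rng = np.random.default_rng(seed)
    hidden = combined_input
    for _ in range(reward_layers):
        w = rng.normal(scale=1.0 / np.sqrt(hidden.shape[-1]), size=(hidden.shape[-1], h_size))
        hidden = swish(hidden @ w)
    w_out = rng.normal(scale=1.0 / np.sqrt(hidden.shape[-1]), size=(hidden.shape[-1], 1))
    return hidden @ w_out  # one predicted reward per input row

# e.g. reward_head(np.ones((8, 32)), reward_layers=2, h_size=64).shape == (8, 1)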

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (18 changes)


self.model_loss = self.policy.forward_loss
if self.predict_return:
- self.model_loss += 0.5 * self.policy.reward_loss
+ self.model_loss += self.policy.reward_loss
if self.with_prior:
if self.use_var_encoder:
self.model_loss += 0.2 * self.policy.encoder_distribution.kl_standard()
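
Read together, this hunk composes the model loss from the forward loss, the reward-prediction loss (whose 0.5 weight is dropped here), and a small KL regularizer when the variational encoder is enabled. A restatement as a plain function, with the weights taken from the diff and everything else assumed:

def combined_model_loss(forward_loss, reward_loss, encoder_kl,
                        predict_return=True, use_var_encoder=False):
    # forward_loss: next-state prediction loss; reward_loss: return-prediction loss;
    # encoder_kl: KL(q(z|s) || N(0, I)) for the variational encoder.
    loss = forward_loss
    if predict_return:
        loss += reward_loss       # weight changed from 0.5 to 1.0 in this commit
    if use_var_encoder:
        loss += 0.2 * encoder_kl  # matches the 0.2 * kl_standard() term above
    return loss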

predict_diff = 0.99 * predict_diff + tf.abs(reward_diff)
encode_dist = tf.reduce_mean(
tf.abs(self.policy.encoder - self.policy.bisim_encoder)
# tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
)
self.encode_dist_val = encode_dist
self.predict_diff_val = predict_diff
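
The encode_dist / predict_diff quantities above implement a bisimulation-style comparison: two states should be embedded at a distance that reflects how differently they are rewarded and how differently their predicted next states behave. A rough NumPy sketch of that idea, using the 0.99 factor and the L1 encoder distance from the diff but otherwise assumed:

import numpy as np

def bisim_quantities(z1, z2, r1, r2, pred_next1, pred_next2, gamma=0.99):
    # L1 distance between the two state encodings (cf. encode_dist above).
    encode_dist = np.mean(np.abs(z1 - z2))
    # Reward gap plus discounted gap between predicted next-state encodings
    # (cf. predict_diff = 0.99 * predict_diff + |reward_diff| above).
    predict_diff = gamma * np.mean(np.abs(pred_next1 - pred_next2)) + np.abs(r1 - r2)
    # A typical bisimulation objective then pulls encode_dist toward predict_diff:
    return (encode_dist - predict_diff) ** 2

# e.g. bisim_quantities(np.ones(8), np.zeros(8), 1.0, 0.0, np.ones(8), np.ones(8)) == 0.0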

# update target encoder
if self.num_updates % self.copy_every == 0:
- self.policy.run_hard_copy()
+ self.policy.run_soft_copy()
self.run_soft_critic_copy()
# print("copy")
# self.policy.get_encoder_weights()

if update_type == "model":
update_vals = self._execute_model(feed_dict, self.model_update_dict)
print("forward loss:", self.policy.sess.run(self.policy.forward_loss, feed_dict=feed_dict), end=" ")
# print("reward loss:", self.policy.sess.run(self.policy.reward_loss, feed_dict=feed_dict))
# print("true reward:", self.policy.sess.run(self.policy.current_reward, feed_dict=feed_dict))
# print("predict reward:", self.policy.sess.run(self.policy.pred_reward, feed_dict=feed_dict))
# print("reward loss:", self.policy.sess.run(self.policy.reward_loss, feed_dict=feed_dict))
print("model only forward loss:", self.policy.sess.run(self.policy.forward_loss, feed_dict=feed_dict), end=" ")
# print("model only reward loss:", self.policy.sess.run(self.policy.reward_loss, feed_dict=feed_dict))
# print("true reward:", self.policy.sess.run(self.policy.current_reward, feed_dict=feed_dict))
# print("predict reward:", self.policy.sess.run(self.policy.pred_reward, feed_dict=feed_dict))
# print("reward loss:", self.policy.sess.run(self.policy.reward_loss, feed_dict=feed_dict))
- self.policy.run_hard_copy()
+ self.policy.run_soft_copy()
self.run_soft_critic_copy()
# print("copy")
# self.policy.get_encoder_weights()

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (57 changes)


num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
# print("update epoch")
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# if self.train_model:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer

- buffer.make_mini_batch(i, i + batch_size), n_sequences, "model"
+ buffer.make_mini_batch(i, i + batch_size), n_sequences, "model_only"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)

# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
if self.use_bisim:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer1 = copy.deepcopy(self.update_buffer)
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer2 = copy.deepcopy(self.update_buffer)
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update_encoder(
buffer1.make_mini_batch(i, i + batch_size),
buffer2.make_mini_batch(i, i + batch_size),
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# if self.use_bisim:
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer1 = copy.deepcopy(self.update_buffer)
# self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer2 = copy.deepcopy(self.update_buffer)
# max_num_batch = buffer_length // batch_size
# for i in range(0, max_num_batch * batch_size, batch_size):
# update_stats = self.optimizer.update_encoder(
# buffer1.make_mini_batch(i, i + batch_size),
# buffer2.make_mini_batch(i, i + batch_size),
# )
# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
else:
self.update_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.update_buffer

num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
for _ in range(num_epoch):
# print("model epoch")
self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
buffer = self.off_policy_buffer
max_num_batch = buffer_length // batch_size

)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
# self.off_policy_buffer.shuffle(sequence_length=self.policy.sequence_length)
# buffer = self.off_policy_buffer
# max_num_batch = buffer_length // batch_size
# for i in range(0, max_num_batch * batch_size, batch_size):
# update_stats = self.optimizer.update_part(
# buffer.make_mini_batch(i, i + batch_size), n_sequences, "model_only"
# )
# for stat_name, value in update_stats.items():
# batch_update_stats[stat_name].append(value)
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
if stat == "Losses/Model Loss": # and np.mean(stat_list) < 0.01:
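
In the use_bisim branch above, the trainer makes two independently shuffled copies of the update buffer and walks them in parallel so that each encoder update sees random pairs of states. A minimal sketch of that pairing scheme (list-based, not the actual AgentBuffer / make_mini_batch API):

import numpy as np

def bisim_pair_batches(buffer, batch_size, seed=0):
    # Two independent shuffles of the same data yield two mini-batch streams;
    # zipping them produces the random state pairs passed to update_encoder.
    rng = np.random.default_rng(seed)
    idx1, idx2 = rng.permutation(len(buffer)), rng.permutation(len(buffer))
    for start in range(0, (len(buffer) // batch_size) * batch_size, batch_size):
        batch1 = [buffer[i] for i in idx1[start:start + batch_size]]
        batch2 = [buffer[i] for i in idx2[start:start + batch_size]]
        yield batch1, batch2

# e.g. for b1, b2 in bisim_pair_batches(list(range(10)), 4): ...  # two 4-element batches per step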

ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (964 changes)
The diff for this file is too large to display.

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (38 changes)


action_size=1,
obs_spec_type="normal", # normal: (x,y); rich: (x+y, x-y, x*y)
goal_type="hard", # easy: 1 or -1; hard: uniformly random
act_speed=1
):
super().__init__()
self.discrete = use_discrete

self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self.agent_id: Dict[str, int] = {}
self.step_size = step_size # defines the difficulty of the test
self.act_speed = act_speed
for name in self.names:
self.agent_id[name] = 0

self.action[name] = None
self.step_result[name] = None
self.step_count[name] = 0
- self.horizon[name] = 5000
+ self.horizon[name] = 1000
print(self.goal)
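
The obs_spec_type comment in the constructor ("normal: (x,y); rich: (x+y, x-y, x*y)") implies a simple feature mapping for the test environment's observations. A hypothetical illustration of that mapping, not the code of _make_obs_spec itself:

def make_observation(x: float, y: float, obs_spec_type: str = "normal"):
    # "normal" exposes the raw coordinates; "rich" exposes the transformed
    # features listed in the constructor comment.
    if obs_spec_type == "normal":
        return [x, y]
    if obs_spec_type == "rich":
        return [x + y, x - y, x * y]
    raise ValueError(f"unknown obs_spec_type: {obs_spec_type}")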
def _make_obs_spec(self) -> List[Any]:

def _compute_reward(self, name: str, done: bool) -> float:
reward = 0.0
# for _pos, goal in zip(self.positions[name], self.goal[name]):
# # if abs(_pos - self.goal[name]) < 0.1:
# # reward += SUCCESS_REWARD
# # else:
# # reward -= TIME_PENALTY
# reward -= abs(_pos - goal) #np.exp(-abs(_pos - goal))
for _pos, goal in zip(self.positions[name], self.goal[name]):
# if abs(_pos - self.goal[name]) < 0.1:
# reward += SUCCESS_REWARD
# else:
# reward -= TIME_PENALTY
reward -= abs(_pos - goal) / 10 #np.exp(-abs(_pos - goal))
if done and self.step_count[name] < self.horizon[name]:
reward = SUCCESS_REWARD
# for _pos in self.positions[name]:
# if self.goal_type == "easy":
# reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
# self.positions[name]
# )
# elif self.goal_type == "hard":
# reward += np.exp(-abs(_pos - self.goal[name]))
else:
reward = -TIME_PENALTY
# if done and self.step_count[name] < self.horizon[name]:
# reward = SUCCESS_REWARD
# # for _pos in self.positions[name]:
# # if self.goal_type == "easy":
# # reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
# # self.positions[name]
# # )
# # elif self.goal_type == "hard":
# # reward += np.exp(-abs(_pos - self.goal[name]))
# else:
# reward = -TIME_PENALTY
return reward
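
The reward hunks above move between a dense, distance-based shaping term (scaled by 1/10) and a sparse success/time-penalty scheme. Here is a compact restatement of both variants, with placeholder values for the module-level SUCCESS_REWARD and TIME_PENALTY constants:

SUCCESS_REWARD = 1.0  # placeholder; the real constant lives elsewhere in this file
TIME_PENALTY = 0.01   # placeholder; the real constant lives elsewhere in this file

def dense_reward(positions, goals):
    # Distance shaping as in the active loop: -|pos - goal| / 10, summed over dimensions.
    return -sum(abs(p - g) for p, g in zip(positions, goals)) / 10

def sparse_reward(done, step_count, horizon):
    # The alternative scheme: a success bonus when the episode terminates
    # before the horizon, otherwise a flat per-step time penalty.
    return SUCCESS_REWARD if done and step_count < horizon else -TIME_PENALTY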
