
bisim fix, disable stop gradient

Branch: /develop/transfer-bisim
yanchaosun, 4 years ago
Commit: 28355444
3 files changed, 37 insertions(+), 36 deletions(-)
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (8 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (29 changes)
  3. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (36 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (8 changes)


         return kl

     def w_distance(self, another):
-        return tf.reduce_sum(tf.squared_difference(self.mu, another.mu))\
-            + tf.reduce_sum(tf.squared_difference(self.sigma, another.sigma))
+        return tf.sqrt(
+            tf.reduce_sum(tf.squared_difference(self.mu, another.mu), axis=1)\
+            + tf.reduce_sum(tf.squared_difference(self.sigma, another.sigma), axis=1)
+        )

 class TransferPolicy(TFPolicy):

             self.vis_encode_type
         )
         # used to encode the next state
-        self.next_encoder = tf.stop_gradient(self.next_encoder)
+        # self.next_encoder = tf.stop_gradient(self.next_encoder)
         # a stable version, used to compute the value
         self.target_encoder = tf.stop_gradient(self.target_encoder)
         self._create_hard_copy()
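For context, the reworked w_distance now returns one distance per batch element instead of a batch-summed scalar; reading self.sigma as per-dimension standard deviations, the quantity it computes is the 2-Wasserstein distance between two diagonal Gaussians, W2 = sqrt(||mu1 - mu2||^2 + ||sigma1 - sigma2||^2). A minimal NumPy sketch of that formula (illustrative only; w2_diag_gaussian is a hypothetical helper, not code from this repository):

import numpy as np

def w2_diag_gaussian(mu1, sigma1, mu2, sigma2):
    # Batched 2-Wasserstein distance between N(mu1, diag(sigma1^2)) and
    # N(mu2, diag(sigma2^2)); inputs have shape (batch, dim), output (batch,).
    return np.sqrt(
        np.sum((mu1 - mu2) ** 2, axis=1) + np.sum((sigma1 - sigma2) ** 2, axis=1)
    )

# Two batches of 3 Gaussians over a 4-dimensional latent space.
mu1, mu2 = np.zeros((3, 4)), np.ones((3, 4))
sigma1, sigma2 = np.ones((3, 4)), np.ones((3, 4))
print(w2_diag_gaussian(mu1, sigma1, mu2, sigma2))  # [2. 2. 2.]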

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (29 changes)


         if self.use_bisim:
             if self.use_var_predict:
-                predict_diff = self.policy.predict_distribution.w_distance(self.policy.bisim_predict_distribution)
+                self.predict_diff = self.policy.predict_distribution.w_distance(self.policy.bisim_predict_distribution)
-                predict_diff = tf.reduce_mean(
-                    tf.squared_difference(self.policy.bisim_predict, self.policy.predict)
-                )
+                self.predict_diff = tf.reduce_sum(
+                    tf.squared_difference(self.policy.bisim_predict, self.policy.predict), axis=1
+                )
-            reward_diff = tf.reduce_mean(
-                tf.squared_difference(self.policy.bisim_pred_reward, self.policy.pred_reward)
-            )
+            self.reward_diff = tf.reduce_sum(
+                tf.abs(self.policy.bisim_pred_reward - self.policy.pred_reward), axis=1
+            )
-            predict_diff = 0.99 * predict_diff + tf.abs(reward_diff)
-            encode_dist = tf.reduce_mean(
-                tf.abs(self.policy.encoder - self.policy.bisim_encoder)
-                # tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
-            )
-            self.encode_dist_val = encode_dist
-            self.predict_diff_val = predict_diff
-            self.bisim_loss = tf.squared_difference(encode_dist, predict_diff)
+            bisim_diff = 0.99 * self.predict_diff + self.reward_diff
+            self.encode_dist = tf.reduce_sum(
+                tf.abs(self.policy.encoder - self.policy.bisim_encoder), axis=1
+            )
+            self.bisim_loss = tf.reduce_mean(tf.squared_difference(self.encode_dist, bisim_diff))
         self.loss = (
             self.policy_loss

         }
         update_vals = self._execute_model(feed_dict, self.bisim_update_dict)
-        # print("model difference:", self.policy.sess.run(self.predict_diff_val, feed_dict=feed_dict))
-        # print("encoder distance:", self.policy.sess.run(self.encode_dist_val, feed_dict=feed_dict))
+        # print("model difference:", self.policy.sess.run(self.predict_diff, feed_dict=feed_dict))
+        # print("reward difference:", self.policy.sess.run(self.reward_diff, feed_dict=feed_dict))
+        # print("encoder distance:", self.policy.sess.run(self.encode_dist, feed_dict=feed_dict))
+        # print("bisim loss:", self.policy.sess.run(self.bisim_loss, feed_dict=feed_dict))

         for stat_name, update_name in stats_needed.items():
             if update_name in update_vals.keys():
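Put differently, the new graph keeps every distance per sample (the axis=1 reductions) and only averages at the end: the encoder distance between each state and its bisimulation counterpart is regressed onto 0.99 * prediction distance + reward difference. A rough NumPy sketch of that loss shape (bisim_loss_np and its arguments are illustrative names, not the optimizer's API):

import numpy as np

def bisim_loss_np(encode_dist, predict_diff, reward_diff, gamma=0.99):
    # encode_dist, predict_diff, reward_diff: per-sample distances, shape (batch,).
    bisim_target = gamma * predict_diff + reward_diff   # target distance per state pair
    return np.mean((encode_dist - bisim_target) ** 2)   # squared error, averaged last

# Batch of 3 state pairs.
print(bisim_loss_np(np.array([1.0, 0.5, 2.0]),
                    np.array([0.8, 0.4, 1.5]),
                    np.array([0.1, 0.0, 0.3])))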

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (36 changes)


         return action_mask

     def _compute_reward(self, name: str, done: bool) -> float:
-        reward = 0.0
-        for _pos, goal in zip(self.positions[name], self.goal[name]):
-            # if abs(_pos - self.goal[name]) < 0.1:
-            #     reward += SUCCESS_REWARD
-            # else:
-            #     reward -= TIME_PENALTY
-            reward -= abs(_pos - goal) / 10 #np.exp(-abs(_pos - goal))
+        # reward = 0.0
+        # for _pos, goal in zip(self.positions[name], self.goal[name]):
+        #     # if abs(_pos - self.goal[name]) < 0.1:
+        #     #     reward += SUCCESS_REWARD
+        #     # else:
+        #     #     reward -= TIME_PENALTY
+        #     reward -= abs(_pos - goal) / 10 #np.exp(-abs(_pos - goal))
-        # if done and self.step_count[name] < self.horizon[name]:
-        #     reward = SUCCESS_REWARD
-        #     # for _pos in self.positions[name]:
-        #     #     if self.goal_type == "easy":
-        #     #         reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
-        #     #             self.positions[name]
-        #     #         )
-        #     #     elif self.goal_type == "hard":
-        #     #         reward += np.exp(-abs(_pos - self.goal[name]))
-        # else:
-        #     reward = -TIME_PENALTY
+        if done and self.step_count[name] < self.horizon[name]:
+            reward = 0.0 #SUCCESS_REWARD
+            # for _pos in self.positions[name]:
+            #     if self.goal_type == "easy":
+            #         reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
+            #             self.positions[name]
+            #         )
+            #     elif self.goal_type == "hard":
+            #         reward += np.exp(-abs(_pos - self.goal[name]))
+        else:
+            reward = -TIME_PENALTY
         return reward
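After this change the active reward in the test environment is sparse: terminating before the horizon yields 0.0 (SUCCESS_REWARD is disabled), every other step costs a small time penalty, and the distance-shaping term is left commented out. A standalone sketch of that rule, assuming a placeholder value for TIME_PENALTY (the real constant lives in transfer_test_envs.py):

TIME_PENALTY = 0.01  # assumed value for illustration only

def compute_reward(done: bool, step_count: int, horizon: int) -> float:
    # Terminating before the horizon now gives 0.0 instead of SUCCESS_REWARD;
    # every other step is penalized.
    if done and step_count < horizon:
        return 0.0
    return -TIME_PENALTY

print(compute_reward(done=True, step_count=30, horizon=100))   # 0.0
print(compute_reward(done=False, step_count=30, horizon=100))  # -0.01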
