small fix

/develop/bisim-review
yanchaosun, 4 years ago
Commit 0e2f6e19

6 changed files with 164 additions and 41 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (13 changed lines)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (2 changed lines)
  3. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (5 changed lines)
  4. ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (120 changed lines)
  5. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (29 changed lines)
  6. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (36 changed lines)

ml-agents/mlagents/trainers/policy/transfer_policy.py (13 changed lines)


feature_size,
name="latent",
reuse=reuse_encoder,
activation=ModelUtils.swish,
# activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent_targ

hidden_stream,
feature_size,
name="latent",
activation=ModelUtils.swish,
# activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
)
return latent
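The two hunks above toggle the swish activation on the dense layer that produces the latent feature vector (and its target-encoder twin, latent_targ). A minimal sketch of that layer, assuming plain TF1 and a hand-rolled swish in place of ModelUtils.swish:

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    def swish(x):
        # swish(x) = x * sigmoid(x), which is what ModelUtils.swish computes
        return x * tf.nn.sigmoid(x)

    def latent_head(hidden_stream, feature_size, reuse=False, use_activation=True):
        # Dense projection of the encoder's hidden stream down to `feature_size`
        # latent features; the diff experiments with disabling the nonlinearity.
        return tf.layers.dense(
            hidden_stream,
            feature_size,
            name="latent",
            reuse=reuse,
            activation=swish if use_activation else None,
            kernel_initializer=tf.initializers.variance_scaling(1.0),
        )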

with self.graph.as_default():
pol = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "policy/mu/bias:0")
print("policy:", self.sess.run(pol))
pred = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "predict")
print("predict:", self.sess.run(pred))
rew = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "reward")
print("reward:", self.sess.run(rew))
def create_encoders(self, var_latent: bool=False, reuse_encoder: bool=False) -> Tuple[tf.Tensor, tf.Tensor]:
encoded_state_list = []

# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.reward_loss = tf.reduce_mean(
self.reward_loss = tf.clip_by_value(tf.reduce_mean(
)
), 1e-10,1.0)
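The reward-loss hunk wraps the mean loss in tf.clip_by_value(..., 1e-10, 1.0), bounding the reward-model term of the objective. A minimal sketch, reusing the tensorflow import above and assuming the inner loss is a squared error (the diff does not show it):

    def clipped_reward_loss(predicted_reward, true_reward):
        # Mean squared error on predicted returns, clipped into [1e-10, 1.0] so a
        # few bad reward predictions cannot dominate the combined training loss.
        mse = tf.reduce_mean(tf.squared_difference(predicted_reward, true_reward))
        return tf.clip_by_value(mse, 1e-10, 1.0)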
def create_bisim_model(
self,

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (2 changed lines)


update_vals = self._execute_model(feed_dict, self.ppo_update_dict)
if self.use_bisim:
batch1 = copy.deepcopy(batch)
batch.shuffle(sequence_length=self.batch_size)
batch.shuffle(sequence_length=10)
batch2 = copy.deepcopy(batch)
bisim_stats = self.update_encoder(batch1, batch2)
update_vals.update(self._execute_model(feed_dict, self.model_update_dict))
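The optimizer hunk builds the state pairs needed for the bisimulation encoder update: it keeps a deep copy of the update batch, shuffles the original (the diff switches between sequence_length=self.batch_size and a fixed chunk of 10), and copies it again so each state is matched with a random partner. A sketch of that pairing, with assumed names:

    import copy

    def make_bisim_pair(batch, sequence_length=10):
        # `batch` is an AgentBuffer-like object with an in-place shuffle().
        batch1 = copy.deepcopy(batch)                    # original ordering
        batch.shuffle(sequence_length=sequence_length)   # permute in chunks
        batch2 = copy.deepcopy(batch)                    # randomly re-paired partner states
        return batch1, batch2                            # fed to update_encoder(batch1, batch2)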

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (5 changed lines)


# if self.num_update >= 10:
# self.train_model = False
print(stat, np.mean(stat_list))
self.policy.get_encoder_weights()
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()

# self.off_policy_buffer.reset_agent()
if self.off_policy_buffer.num_experiences > 10 * self.hyperparameters.buffer_size:
if self.off_policy_buffer.num_experiences > 4 * self.hyperparameters.buffer_size:
int(5 * self.hyperparameters.buffer_size)
int(2 * self.hyperparameters.buffer_size)
)
print("truncate")
# self.train_model = False
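The trainer hunk adjusts when the off-policy buffer is truncated (the diff contains both a 10x/5x and a 4x/2x multiple of buffer_size). A sketch of the rule with the multipliers as parameters; attribute names follow the hunk but are otherwise assumptions:

    def maybe_truncate(off_policy_buffer, buffer_size, high_water=4, low_water=2):
        # Once the buffer holds more than high_water * buffer_size experiences,
        # drop the oldest entries so at most low_water * buffer_size remain.
        if off_policy_buffer.num_experiences > high_water * buffer_size:
            off_policy_buffer.truncate(int(low_water * buffer_size))
            print("truncate")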

ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (120 changed lines)
Diff too large to display; view the file directly.

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (29 changed lines)


tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=2, hidden_units=16),
network_settings=NetworkSettings(num_layers=2, hidden_units=16, normalize=True),
summary_freq=100,
max_steps=1000,
threaded=False,

config.hyperparameters, batch_size=120, buffer_size=12000, learning_rate=5.0e-3,
use_bisim=True, predict_return=True,
# separate_value_train=True, separate_policy_train=True,
use_var_predict=True, with_prior=True, use_op_buffer=False, in_epoch_alter=False, in_batch_alter=True,
policy_layers=1, value_layers=1, encoder_layers=1, feature_size=4,
use_var_predict=True, with_prior=True, use_op_buffer=True, in_epoch_alter=False, in_batch_alter=True,
policy_layers=2, value_layers=2, encoder_layers=0, feature_size=2,
#use_inverse_model=True
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)
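The test file derives each run's settings from a base config with attr.evolve, replacing only the hyperparameters it sweeps over. A self-contained sketch of that pattern; the field names below are illustrative, not the full ml-agents settings schema:

    import attr

    @attr.s(auto_attribs=True)
    class Hyperparameters:
        batch_size: int = 32
        buffer_size: int = 1024
        learning_rate: float = 3.0e-4

    @attr.s(auto_attribs=True)
    class Config:
        hyperparameters: Hyperparameters = attr.Factory(Hyperparameters)
        max_steps: int = 10000
        summary_freq: int = 1000

    base = Config()
    # evolve() returns new objects, so the base config is never mutated.
    new_hyperparams = attr.evolve(base.hyperparameters, batch_size=120, buffer_size=12000)
    config = attr.evolve(base, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)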

)
new_hyperparams = attr.evolve(
config.hyperparameters, batch_size=120, buffer_size=12000, use_transfer=True,
transfer_path=transfer_from, separate_policy_train=True,
transfer_path=transfer_from, #separate_policy_train=True, separate_value_train=True,
train_policy=True, train_value=True, train_model=False, feature_size=4,
use_var_predict=True, with_prior=True, policy_layers=0, load_policy=False,
load_value=False, predict_return=True, value_layers=0, encoder_layers=2,
train_policy=False, train_value=False, train_model=False, feature_size=2,
use_var_predict=True, with_prior=True, policy_layers=2, load_policy=True,
load_value=True, predict_return=True, value_layers=2, encoder_layers=0,
use_bisim=True,
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=300000, summary_freq=5000)
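The transfer variant of the config flips a cluster of flags: use_transfer with a transfer_path to a previous run, load_policy/load_value, and train_policy/train_value/train_model controlling which parts keep learning in the new task. A hypothetical reading of that flag bundle, reusing attr from the sketch above; none of the semantics in the comments are confirmed by the diff itself:

    @attr.s(auto_attribs=True)
    class TransferFlags:
        use_transfer: bool = True     # start from a previously trained run
        transfer_path: str = ""       # directory of that run (set per experiment)
        load_policy: bool = True      # presumably: restore policy weights from transfer_path
        load_value: bool = True       # presumably: restore value-head weights
        train_policy: bool = False    # presumably: freeze the policy if False
        train_value: bool = False     # presumably: freeze the value head if False
        train_model: bool = False     # presumably: freeze the learned dynamics/reward model
        use_bisim: bool = True        # keep the bisimulation encoder objective enabled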

if __name__ == "__main__":
for obs in ["normal", "rich1", "rich2"]: # ["normal", "rich1", "rich2"]:
test_2d_model(seed=0, obs_spec_type=obs, run_id="model_" + obs \
+ "_f4_pv-l0_rew_bisim_order-ibalter_noreuse-soft0.1_noop_conlr")
# for obs in ["normal"]: # ["normal", "rich1", "rich2"]:
# test_2d_model(seed=0, obs_spec_type=obs, run_id="model_" + obs \
# + "_f2_pv-l2_linear-rew_ibalter_conlr_enc-l0-op4_bisim")
# for obs in ["normal"]:
# test_2d_transfer(seed=0, obs_spec_type="rich1",
# transfer_from="./transfer_results/model_"+ obs +"_f4_pv-l0_rew_bisim_order-ibalter_noreuse-soft0.1_nostop-op10_linlr_s0/Simple",
# run_id="transfer_rich1_f4_pv-l0_soft_ibalter_sepp_from_" + obs)
for obs in ["normal"]:
test_2d_transfer(seed=0, obs_spec_type="normal",
transfer_from="./transfer_results/model_"+ obs +"_f2_pv-l2_linear-rew_ibalter_conlr_enc-l0-op4_bisim_s0/Simple",
run_id="transfer_normal_f2_pv-l2_ibalter_fixbisim_from_" + obs)
# for obs in ["normal"]:
# test_2d_transfer(seed=0, obs_spec_type="rich1",

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (36 changed lines)


self.action[name] = None
self.step_result[name] = None
self.step_count[name] = 0
self.horizon[name] = 5000
self.horizon[name] = 1000
print(self.goal)
def _make_obs_spec(self) -> List[Any]:

if self.goal_type == "easy":
done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name]) or self.step_count[name] >= self.horizon[name]
elif self.goal_type == "hard":
# done = self.step_count[name] >= self.horizon[name]
done = all(abs(pos-goal) <= 0.1 for pos, goal in zip(self.positions[name], self.goal[name])) \
or self.step_count[name] >= self.horizon[name]
done = self.step_count[name] >= self.horizon[name]
# done = all(abs(pos-goal) <= 0.1 for pos, goal in zip(self.positions[name], self.goal[name])) \
# or self.step_count[name] >= self.horizon[name]
# if done:
# print(self.positions[name], end=" done ")
return done
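The termination hunk distinguishes the "easy" goal (episode ends once every coordinate leaves [-1, 1]) from the "hard" goal, for which the diff contains both a reach-the-goal-within-0.1 condition and a horizon-only condition. A sketch with the assumed structure, exposing both hard-goal variants behind a flag:

    def is_done(goal_type, positions, goals, step_count, horizon, hard_uses_goal=False):
        if goal_type == "easy":
            # Done when every coordinate has left the [-1, 1] band, or on timeout.
            return all(pos >= 1.0 or pos <= -1.0 for pos in positions) or step_count >= horizon
        if goal_type == "hard" and hard_uses_goal:
            # Done when every coordinate is within 0.1 of its goal, or on timeout.
            reached = all(abs(pos - goal) <= 0.1 for pos, goal in zip(positions, goals))
            return reached or step_count >= horizon
        # Horizon-only variant for the hard goal.
        return step_count >= horizon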

return action_mask
def _compute_reward(self, name: str, done: bool) -> float:
# reward = 0.0
# for _pos in self.positions[name]:
reward = 0.0
for _pos, goal in zip(self.positions[name], self.goal[name]):
# reward += np.exp(-abs(_pos - self.goal[name]))
reward += 2 - abs(_pos - goal) #np.exp(-abs(_pos - goal))
if done:
reward = TIME_PENALTY #SUCCESS_REWARD
# for _pos in self.positions[name]:
# if self.goal_type == "easy":
# reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
# self.positions[name]
# )
# elif self.goal_type == "hard":
# reward += np.exp(-abs(_pos - self.goal[name]))
else:
reward = -TIME_PENALTY
# if done:
# reward = SUCCESS_REWARD
# # for _pos in self.positions[name]:
# # if self.goal_type == "easy":
# # reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
# # self.positions[name]
# # )
# # elif self.goal_type == "hard":
# # reward += np.exp(-abs(_pos - self.goal[name]))
# else:
# reward = -TIME_PENALTY
return reward
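The reward hunk accumulates a dense shaping term, 2 - |pos - goal| per dimension (with exp(-|pos - goal|) as a commented alternative), alongside a sparse done/TIME_PENALTY branch that also appears in fully commented form. A minimal sketch of the dense version alone, with assumed argument names:

    def compute_reward(positions, goals):
        # Dense shaping: each coordinate contributes 2 - |pos - goal|, so the total
        # grows as all coordinates approach their goals (alternative in the diff:
        # np.exp(-abs(pos - goal)) per coordinate).
        reward = 0.0
        for pos, goal in zip(positions, goals):
            reward += 2 - abs(pos - goal)
        return reward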
