reward_signal_steps_per_update: 20.0
encoder_layers: 1
policy_layers: 2
forward_layers: 2
# NOTE(review): duplicate key — `forward_layers` is set again to 0 on the next
# line; in YAML the later value wins, so only 0 takes effect. Confirm intent.
forward_layers: 0
feature_size: 64
action_feature_size: 16
# NOTE(review): `feature_size` and `action_feature_size` are repeated below with
# different values (128 / 32); the later entries override the earlier ones.
# Remove whichever pair is not intended.
feature_size: 128
action_feature_size: 32
separate_policy_train: true
separate_policy_net: true
separate_model_train: true
reuse_encoder: false
)
self.model_learning_rate = ModelUtils.create_schedule(
hyperparameters.model_schedule,
lr,
lr * 10,
self.policy.global_step,
int(max_step),
min_value=1e-10,