Browse Source

simple env test

/develop/bisim-review
yanchaosun, 4 years ago
Current commit: 7e3216ae
8 changed files with 1288 additions and 48 deletions
  1. config/ppo_transfer/CrawlerStaticOpbuffer.yaml (7 changes)
  2. config/ppo_transfer/WalkerStaticSingle.yaml (3 changes)
  3. ml-agents/mlagents/trainers/policy/transfer_policy.py (4 changes)
  4. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (11 changes)
  5. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (51 changes)
  6. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (76 changes)
  7. ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (837 changes)
  8. ml-agents/mlagents/trainers/tests/encoder_test.ipynb (347 changes)

config/ppo_transfer/CrawlerStaticOpbuffer.yaml (7 changes)


      learning_rate_schedule: linear
      encoder_layers: 2
      policy_layers: 2
-     value_layers: 2
-     forward_layers: 1
+     value_layers: 3
+     forward_layers: 2
      use_var_predict: true
      in_batch_alter: true
      use_op_buffer: true
    network_settings:
      normalize: true
      hidden_units: 512

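For orientation, the transfer-specific options touched in this file can be summarized as a plain dataclass. This is an editorial sketch, not the branch's actual settings class (which is not part of this diff); the inline glosses are informed guesses from the option names.

from dataclasses import dataclass

@dataclass
class TransferModelOptions:
    # Editorial sketch of the keys tuned in CrawlerStaticOpbuffer.yaml; defaults mirror the new values.
    encoder_layers: int = 2       # depth of the shared state encoder
    policy_layers: int = 2        # depth of the policy head
    value_layers: int = 3         # depth of the value head (raised from 2 in this commit)
    forward_layers: int = 2       # depth of the forward/dynamics model (raised from 1)
    use_var_predict: bool = True  # presumably: predict a variance/distribution over next features
    in_batch_alter: bool = True   # presumably: alternate model and policy updates within a batch
    use_op_buffer: bool = True    # presumably: keep an off-policy buffer for model training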
config/ppo_transfer/WalkerStaticSingle.yaml (3 changes)


    hyperparameters:
      batch_size: 2048
      buffer_size: 20480
-     learning_rate: 0.0003
+     learning_rate: 0.0001
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95

      feature_size: 64
      reuse_encoder: true
      in_epoch_alter: true
      use_op_buffer: true
    network_settings:
      normalize: true
      hidden_units: 512

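For context, the keys in the two config hunks above are assumed to nest as follows inside a ppo_transfer behavior entry; the behavior name (WalkerStatic) is a placeholder, and the transfer-specific flags are assumed to sit under hyperparameters next to the standard PPO ones.

import yaml

# Assumed nesting of the WalkerStaticSingle.yaml keys shown above (behavior name is a placeholder).
WALKER_CONFIG = """
behaviors:
  WalkerStatic:
    trainer_type: ppo_transfer
    hyperparameters:
      batch_size: 2048
      buffer_size: 20480
      learning_rate: 0.0001
      beta: 0.005
      epsilon: 0.2
      lambd: 0.95
      feature_size: 64
      reuse_encoder: true
      in_epoch_alter: true
      use_op_buffer: true
    network_settings:
      normalize: true
      hidden_units: 512
"""

settings = yaml.safe_load(WALKER_CONFIG)
hyperparameters = settings["behaviors"]["WalkerStatic"]["hyperparameters"]
assert hyperparameters["learning_rate"] == 0.0001 and hyperparameters["use_op_buffer"]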
ml-agents/mlagents/trainers/policy/transfer_policy.py (4 changes)


        for i in range(forward_layers):
            hidden = tf.layers.dense(
                hidden,
-               self.h_size
-               * (self.vis_obs_size + int(self.vec_obs_size > 0)),
+               self.h_size,
+               # * (self.vis_obs_size + int(self.vec_obs_size > 0)),
                name="hidden_{}".format(i),
                # activation=ModelUtils.swish,
                # kernel_initializer=tf.initializers.variance_scaling(1.0),

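A cleaned-up sketch of the pattern in this hunk: the forward-model hidden layers now keep a fixed width of h_size instead of scaling the width by the number of observation streams. This is a minimal TF1-style reconstruction (via tf.compat.v1), not the surrounding code of transfer_policy.py.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

def build_forward_hidden(hidden: tf.Tensor, forward_layers: int, h_size: int) -> tf.Tensor:
    # Stack `forward_layers` dense layers of constant width `h_size`.
    # The old width, h_size * (vis_obs_size + int(vec_obs_size > 0)), is dropped in this commit.
    for i in range(forward_layers):
        hidden = tf.layers.dense(
            hidden,
            h_size,
            name="hidden_{}".format(i),
            # activation=ModelUtils.swish,  # left disabled, as in the diff
        )
    return hidden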
ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (11 changes)


"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Model Loss": "model_loss",
"Losses/Reward Loss": "reward_loss",
if self.predict_return:
self.stats_name_to_update_name.update({
"Losses/Reward Loss": "reward_loss",
})
if self.policy.use_recurrent:
self.m_size = self.policy.m_size
self.memory_in = tf.placeholder(

        # self.policy.get_encoder_weights()
        for stat_name, update_name in stats_needed.items():
-           if update_name in update_vals.keys():
-               update_stats[stat_name] = update_vals[update_name]
+           # if update_name in update_vals.keys():
+           update_stats[stat_name] = update_vals[update_name]
+       self.num_updates += 1
        return update_stats

    def _construct_feed_dict(
        self, mini_batch: AgentBuffer, num_sequences: int
    ) -> Dict[tf.Tensor, Any]:
        # print(mini_batch.keys())
        # Do an optional burn-in for memories
        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
        burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)

        if self.policy.vis_obs_size > 0:
            for i, _ in enumerate(self.policy.visual_in):
                feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
                feed_dict[self.policy.visual_next[i]] = mini_batch["next_visual_obs%d" % i]
        if self.policy.use_recurrent:
            feed_dict[self.policy.memory_in] = [
                mini_batch["memory"][i]

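To make the optimizer changes easier to follow, here is a self-contained sketch of the two bookkeeping tweaks above: the reward-loss stat is registered only when the model predicts returns, and fetched update values are copied into the reported stats without the old membership guard while an update counter advances. Class and method names here are illustrative, not the optimizer's actual API in this branch.

from typing import Dict

class StatsBookkeepingSketch:
    def __init__(self, predict_return: bool):
        self.num_updates = 0
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Model Loss": "model_loss",
        }
        if predict_return:
            # Reward-model loss is only reported when returns are predicted.
            self.stats_name_to_update_name.update({"Losses/Reward Loss": "reward_loss"})

    def collect(self, stats_needed: Dict[str, str], update_vals: Dict[str, float]) -> Dict[str, float]:
        update_stats = {}
        for stat_name, update_name in stats_needed.items():
            # The old `if update_name in update_vals` guard is dropped; a missing key now raises.
            update_stats[stat_name] = update_vals[update_name]
        self.num_updates += 1
        return update_stats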
ml-agents/mlagents/trainers/tests/test_simple_transfer.py (51 changes)


        feature_size=2,
        reuse_encoder=True,
        in_epoch_alter=True,
        in_batch_alter=False,
        # in_batch_alter=True,
        policy_layers=1
        # policy_layers=0,
        # value_layers=0,
        # conv_thres=1e-4,
        # predict_return=True
        # separate_policy_train=True,
        # separate_value_train=True
        # separate_value_net=True,
    ),
    network_settings=NetworkSettings(num_layers=1, hidden_units=32),
    summary_freq=500,

    tc.start_learning(env_manager)
    # debug_writer.write2file(model_dir+"/reward.txt")
-   if (
-       success_threshold is not None
-   ):  # For tests where we are just checking setup and not reward
-       processed_rewards = [
-           reward_processor(rewards) for rewards in env.final_rewards.values()
-       ]
-       assert all(not math.isnan(reward) for reward in processed_rewards)
-       assert all(reward > success_threshold for reward in processed_rewards)
+   # if (
+   #     success_threshold is not None
+   # ):  # For tests where we are just checking setup and not reward
+   #     processed_rewards = [
+   #         reward_processor(rewards) for rewards in env.final_rewards.values()
+   #     ]
+   #     assert all(not math.isnan(reward) for reward in processed_rewards)
+   #     assert all(reward > success_threshold for reward in processed_rewards)

-def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich", run_id="modelbased_rich_5e-4", seed=1337):
+def test_2d_model(config=Transfer_CONFIG, obs_spec_type="normal", run_id="model_normal", seed=0):
-       [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
+       [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
+       num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"
-       config.hyperparameters, batch_size=64, buffer_size=640, learning_rate=5.0e-4,
+       config.hyperparameters, batch_size=120, buffer_size=12000, learning_rate=5.0e-3
-   config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
+   config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)

-def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich", run_id="transfer_rich_iealter_retrain-enc_5e-4", seed=1337):
+def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich2", run_id="transfer_rich2_from-rich1", seed=1337):
-       [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
+       [BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.1,
+       num_vector=2, obs_spec_type=obs_spec_type, goal_type="hard"
-       config.hyperparameters, batch_size=64, buffer_size=640, use_transfer=True,
-       transfer_path="./transfer_results/modelbased_normal_opbuf_ibalter_s2/Simple",
-       use_op_buffer=True, in_epoch_alter=True, learning_rate=5.0e-4, train_policy=False,
-       train_value=False, train_model=False
+       config.hyperparameters, batch_size=120, buffer_size=12000, use_transfer=True,
+       transfer_path="./transfer_results/model_rich1_s0/Simple",
+       use_op_buffer=True, in_epoch_alter=True, in_batch_alter=False, learning_rate=5e-4,
+       train_policy=False, train_value=False, train_model=False, feature_size=2
-   config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
+   config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000)

    # test_2d_model(config=PPO_CONFIG, run_id="ppo_normal", seed=0)
    test_2d_transfer(seed=0)
    # for i in range(5):
    #     test_2d_model(seed=i)

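The test switches the 2-D environment to the harder goal setting and points the transfer run at a model trained on the rich1 observations. The override pattern can be sketched as below; hyperparameter names such as use_transfer, transfer_path, and train_model come from the hunk and are specific to this branch, and the base config is assumed to be an attrs-based TrainerSettings object like Transfer_CONFIG above.

import attr

def make_transfer_config(base_config, transfer_path: str):
    # Sketch of the attr.evolve override pattern used in test_2d_transfer above.
    new_hyperparams = attr.evolve(
        base_config.hyperparameters,
        batch_size=120,
        buffer_size=12000,
        learning_rate=5e-4,
        use_transfer=True,
        transfer_path=transfer_path,  # e.g. a model trained on the rich1 task
        use_op_buffer=True,
        in_epoch_alter=True,
        in_batch_alter=False,
        train_policy=False,  # freeze policy, value, and model heads during transfer
        train_value=False,
        train_model=False,
        feature_size=2,
    )
    return attr.evolve(
        base_config, hyperparameters=new_hyperparams, max_steps=200000, summary_freq=5000
    )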
ml-agents/mlagents/trainers/tests/transfer_test_envs.py (76 changes)


        vis_obs_size=VIS_OBS_SIZE,
        vec_obs_size=OBS_SIZE,
        action_size=1,
-       obs_spec_type="normal"  # normal: (x,y); rich: (x+y, x-y, x*y)
+       obs_spec_type="normal",  # normal: (x,y); rich: (x+y, x-y, x*y)
+       goal_type="hard",  # easy: 1 or -1; hard: uniformly random
    ):
        super().__init__()
        self.discrete = use_discrete

        self.vec_obs_size = vec_obs_size
        self.obs_spec_type = obs_spec_type
+       self.goal_type = goal_type
        action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
        self.behavior_spec = BehaviorSpec(
            self._make_obs_spec(),

        self.positions: Dict[str, List[float]] = {}
        self.step_count: Dict[str, float] = {}
        self.random = random.Random(str(self.behavior_spec))
-       self.goal: Dict[str, int] = {}
+       self.goal: Dict[str, List[float]] = {}
+       self.num_steps: Dict[str, int] = {}
+       self.horizon: Dict[str, int] = {}
        self.action = {}
        self.rewards: Dict[str, float] = {}
        self.final_rewards: Dict[str, List[float]] = {}

        for name in self.names:
            self.agent_id[name] = 0
-           self.goal[name] = self.random.choice([-1, 1])
+           if self.goal_type == "easy":
+               self.goal[name] = self.random.choice([-1, 1])
+           elif self.goal_type == "hard":
+               self.goal[name] = []
+               for _ in range(self.num_vector):
+                   self.goal[name].append(self.random.uniform(-1,1))
            self.step_count[name] = 0
+           self.horizon[name] = 5000
+       print(self.goal)

    def _make_obs_spec(self) -> List[Any]:
        obs_spec: List[Any] = []

        for _ in range(self.num_vector):
            obs_spec.append((self.vec_obs_size,))
        # composed position
-       if self.obs_spec_type == "rich":
+       if "rich" in self.obs_spec_type:

-   def _make_obs(self, value: float) -> List[np.ndarray]:
+   def _make_obs(self, value: List[float]) -> List[np.ndarray]:
-       for _ in range(self.num_vector):
-           obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * value)
+       for i in range(self.num_vector):
+           obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * value[i])
-       elif self.obs_spec_type == "rich":
+       elif self.obs_spec_type == "rich1":
            for name in self.names:
                i = self.positions[name][0]
                j = self.positions[name][1]

+       elif self.obs_spec_type == "rich2":
+           for name in self.names:
+               i = self.positions[name][0]
+               j = self.positions[name][1]
+               obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i*j))
+               obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (2*i+j))
+               obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (2*i-j))
        for _ in range(self.num_visual):
            obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
        return obs

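The environment now exposes three observation variants for the same underlying 2-D position. A small standalone sketch of the mapping: "normal" and "rich2" follow the hunk directly, while the "rich1" features are inferred from the inline comment "rich: (x+y, x-y, x*y)" and are an assumption, since the rich1 body is not shown in this diff.

import numpy as np

def make_position_features(x: float, y: float, obs_spec_type: str, vec_obs_size: int = 1):
    # Return one (1, vec_obs_size) array per observation stream, as the test env does.
    if obs_spec_type == "normal":
        values = [x, y]
    elif obs_spec_type == "rich1":
        values = [x + y, x - y, x * y]  # assumed from the inline comment; rich1 body not shown
    elif obs_spec_type == "rich2":
        values = [x * y, 2 * x + y, 2 * x - y]
    else:
        raise ValueError(f"unknown obs_spec_type: {obs_spec_type}")
    return [np.ones((1, vec_obs_size), dtype=np.float32) * v for v in values]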
            self.positions[name][i] = clamp(self.positions[name][i], -1, 1)
        self.step_count[name] += 1
        # Both must be in 1.0 to be done
-       done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name])
+       # print(self.positions)
+       # print(self.positions[name], end="")
+       if self.goal_type == "easy":
+           done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name]) or self.step_count[name] >= self.horizon[name]
+       elif self.goal_type == "hard":
+           # done = self.step_count[name] >= self.horizon[name]
+           done = all(abs(pos-goal) <= 0.1 for pos, goal in zip(self.positions[name], self.goal[name])) \
+               or self.step_count[name] >= self.horizon[name]
+       # if done:
+       #     print(self.positions[name], end=" done ")
        return done

    def _generate_mask(self):

        return action_mask

    def _compute_reward(self, name: str, done: bool) -> float:
-       # reward = 0.0
-       # for _pos in self.positions[name]:
-       #     if abs(_pos - self.goal[name]) < 0.1:
-       #         reward += SUCCESS_REWARD
-       #     else:
-       #         reward -= TIME_PENALTY
-       #     reward += np.exp(-abs(_pos - self.goal[name]))
-       reward = 0.0
-       for _pos in self.positions[name]:
-           reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
-               self.positions[name]
-           )
+       reward = SUCCESS_REWARD
+       # for _pos in self.positions[name]:
+       #     if self.goal_type == "easy":
+       #         reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
+       #             self.positions[name]
+       #         )
+       #     elif self.goal_type == "hard":
+       #         reward += np.exp(-abs(_pos - self.goal[name]))

-       self.goal[name] = self.random.choice([-1, 1])
+       if self.goal_type == "easy":
+           self.goal[name] = self.random.choice([-1, 1])
+       elif self.goal_type == "hard":
+           self.goal[name] = []
+           for _ in range(self.num_vector):
+               self.goal[name].append(self.random.uniform(-1,1))
+       # print("new goal:", self.goal[name])

    def _make_batched_step(
        self, name: str, done: bool, reward: float

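Under the "hard" goal setting introduced here, an episode ends when every position coordinate is within 0.1 of its goal (or the horizon is reached), success pays a flat SUCCESS_REWARD, and a fresh goal is drawn uniformly from [-1, 1] per dimension. A standalone sketch, with SUCCESS_REWARD = 1.0 assumed since the constant's value is not shown in this diff:

import random

SUCCESS_REWARD = 1.0  # assumed value; the constant is defined elsewhere in the test env

def hard_goal_done(positions, goal, step_count, horizon):
    # Done when every coordinate is within 0.1 of its goal, or the horizon is hit.
    reached = all(abs(pos - g) <= 0.1 for pos, g in zip(positions, goal))
    return reached or step_count >= horizon

def resample_hard_goal(rng: random.Random, num_vector: int):
    # Draw a fresh goal uniformly from [-1, 1] in each dimension.
    return [rng.uniform(-1, 1) for _ in range(num_vector)]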
ml-agents/mlagents/trainers/tests/encoder_plot.ipynb (837 changes)
Diff not shown: file is too large to display (view file).

ml-agents/mlagents/trainers/tests/encoder_test.ipynb (347 changes)
Diff not shown: file is too large to display (view file).
