
Develop model transfer test (#4214)

* test env, and code integration

* delete results
Branch: /develop/bisim-review
GitHub committed 4 years ago
Current commit: 839eb2cb
6 changed files, with 560 additions and 80 deletions
  1. ml-agents/mlagents/trainers/policy/transfer_policy.py (29 changes)
  2. ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (136 changes)
  3. ml-agents/mlagents/trainers/ppo_transfer/trainer.py (4 changes)
  4. ml-agents/mlagents/trainers/settings.py (10 changes)
  5. ml-agents/mlagents/trainers/tests/test_simple_transfer.py (223 changes)
  6. ml-agents/mlagents/trainers/tests/transfer_test_envs.py (238 changes)

ml-agents/mlagents/trainers/policy/transfer_policy.py (29 changes)


# We assume the first thing created in the graph is the Policy. If
# already populated, don't create more tensors.
return
self.create_input_placeholders()
self.current_action = tf.placeholder(
shape=[None, sum(self.act_size)], dtype=tf.float32, name="current_action"

# slim.model_analyzer.analyze_vars(self.trainable_variables, print_info=True)
def load_graph_partial(self, path: str, transfer_type="dynamics"):
load_nets = {"dynamics": ["predict"],
def load_graph_partial(self, path: str, transfer_type="dynamics", load_model=True, load_policy=True,
load_value=True):
load_nets = {"dynamics": [],
if load_model:
load_nets["dynamics"].append("predict")
if load_policy:
load_nets["dynamics"].append("policy")
if load_value:
load_nets["dynamics"].append("value")
with self.graph.as_default():
for net in load_nets[transfer_type]:
variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, net)

self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
if var_predict:

hidden,
self.feature_size,
name="latent",
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
squared_difference = 0.5 * tf.reduce_sum(

self.h_size
* (self.vis_obs_size + int(self.vec_obs_size > 0)),
name="hidden_{}".format(i),
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
activation=ModelUtils.swish,
kernel_initializer=tf.initializers.variance_scaling(1.0),
# activation=ModelUtils.swish,
# kernel_initializer=tf.initializers.variance_scaling(1.0),
)
self.reward_loss = tf.reduce_mean(
tf.squared_difference(self.pred_reward, self.current_reward)
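Note on the partial loading above: in the TF1 graph API used here, restoring only part of a checkpoint is typically done by building a tf.train.Saver over the variables of the chosen scopes. A minimal sketch of that pattern, assuming the scope names from this diff ("predict", "policy", "value") and a checkpoint directory written by a previous run; the helper name restore_scopes is illustrative and not part of the PR:
import tensorflow as tf
def restore_scopes(sess, graph, checkpoint_dir, scopes):
    # Restore only the trainable variables that live under the given scopes,
    # leaving everything else at its freshly initialized value.
    with graph.as_default():
        for scope in scopes:
            variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            if not variables:
                continue
            saver = tf.train.Saver(var_list=variables)
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            saver.restore(sess, ckpt.model_checkpoint_path)
# e.g. restore_scopes(policy.sess, policy.graph, transfer_path, ["predict", "policy", "value"])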

ml-agents/mlagents/trainers/ppo_transfer/optimizer.py (136 changes)


self.in_batch_alter = hyperparameters.in_batch_alter
self.in_epoch_alter = hyperparameters.in_epoch_alter
self.op_buffer = hyperparameters.use_op_buffer
self.train_type = hyperparameters.train_type
self.train_encoder = hyperparameters.train_encoder
self.train_model = hyperparameters.train_model
self.train_policy = hyperparameters.train_policy
self.train_value = hyperparameters.train_value
# Transfer
self.use_transfer = hyperparameters.use_transfer
self.transfer_path = hyperparameters.transfer_path #"results/BallSingle_nosep_cmodel_small/3DBall"

with tf.variable_scope("value"):
if policy.use_continuous_act:
self._create_cc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
if hyperparameters.separate_value_net:
self._create_cc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
else:
self._create_cc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
self._create_dc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
if hyperparameters.separate_value_net:
self._create_dc_critic_old(h_size, hyperparameters.value_layers, vis_encode_type)
else:
self._create_dc_critic(h_size, hyperparameters.value_layers, vis_encode_type)
with tf.variable_scope("optimizer/"):
self.learning_rate = ModelUtils.create_schedule(

self.policy.initialize_or_load()
if self.use_transfer:
self.policy.load_graph_partial(self.transfer_path, self.transfer_type)
self.policy.load_graph_partial(self.transfer_path, self.transfer_type,
hyperparameters.load_model, hyperparameters.load_policy, hyperparameters.load_value)
self.policy.get_encoder_weights()
self.policy.get_policy_weights()

)
def _create_ppo_optimizer_ops(self):
if self.use_transfer:
if self.transfer_type == "dynamics":
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
print("trainable", train_vars)
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/mu")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/log_std")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
elif self.transfer_type == "observation":
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
else:
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
print("trainable", train_vars)
# if self.use_transfer:
# if self.transfer_type == "dynamics":
# if self.train_type == "all":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
# elif self.train_type == "encoding":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# elif self.train_type == "policy":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# print("trainable", train_vars)
# # train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/mu")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy/log_std")
# # train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value/extrinsic_value")
# elif self.transfer_type == "observation":
# if self.train_type == "all":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
# elif self.train_type == "policy":
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
# + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
# # + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding/latent")
# else:
# train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
train_vars = []
if self.train_encoder:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
if self.train_policy:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
if self.train_value:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
print("trainable", train_vars)
self.tf_optimizer = self.create_optimizer_op(self.learning_rate)
self.grads = self.tf_optimizer.compute_gradients(self.loss, var_list=train_vars)
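A practical aside on the flag-driven selection above: printing train_vars dumps full variable reprs. When checking which sub-networks the train_encoder / train_model / train_policy / train_value flags actually select, it is usually enough to list names and shapes; a small illustrative helper (not part of the PR):
def print_trainable(scopes=("encoding", "predict", "policy", "value")):
    # Summarize the trainable variables per scope instead of printing whole tensors.
    for scope in scopes:
        for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope):
            print(scope, v.name, v.shape.as_list())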

def _init_alter_update(self):
if self.use_alter:
policy_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
policy_train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
model_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.loss, var_list=policy_train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.loss, var_list=policy_train_vars)
self.model_optimizer = self.create_optimizer_op(self.learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.loss, var_list=model_train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.loss, var_list=model_train_vars)
else:
if self.train_type == "all":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
elif self.train_type == "encoding":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
# train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target_enc")
elif self.train_type == "policy":
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "inverse") \
+ tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
train_vars = []
if self.train_encoder:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "encoding")
if self.train_model:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predict")
if self.train_policy:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "policy")
if self.train_value:
train_vars += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value")
self.ppo_optimizer = self.create_optimizer_op(self.learning_rate)
self.ppo_grads = self.ppo_optimizer.compute_gradients(self.ppo_loss, var_list=train_vars)
self.ppo_update_batch = self.ppo_optimizer.minimize(self.ppo_loss, var_list=train_vars)
self.model_optimizer = self.create_optimizer_op(self.model_learning_rate)
self.model_grads = self.model_optimizer.compute_gradients(self.model_loss, var_list=train_vars)
self.model_update_batch = self.model_optimizer.minimize(self.model_loss, var_list=train_vars)
self.ppo_update_dict.update(
{

}
)
self.model_update_dict.update(
{
"model_loss": self.model_loss,

for stat_name, update_name in stats_needed.items():
if update_name in update_vals.keys():
update_stats[stat_name] = update_vals[update_name]
self.num_updates += 1
return update_stats

feed_dict[self.memory_in] = self._make_zero_mem(
self.m_size, mini_batch.num_experiences
)
# print(self.policy.sess.run(self.policy.encoder, feed_dict={self.policy.vector_in: mini_batch["vector_obs"]}))
return feed_dict
def _create_cc_critic_old(

ml-agents/mlagents/trainers/ppo_transfer/trainer.py (4 changes)


buffer = self.update_buffer
max_num_batch = buffer_length // batch_size
for i in range(0, max_num_batch * batch_size, batch_size):
update_stats = self.optimizer.update(
buffer.make_mini_batch(i, i + batch_size), n_sequences
update_stats = self.optimizer.update_part(
buffer.make_mini_batch(i, i + batch_size), n_sequences, "policy"
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
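The trainer-side change replaces optimizer.update with optimizer.update_part plus an explicit part name; only the "policy" part appears in this hunk. Assuming update_part also accepts "model" (suggested by the model_update_dict in the optimizer, but not shown here), a per-mini-batch alternation consistent with the in_batch_alter flag could look like this sketch:
for i in range(0, max_num_batch * batch_size, batch_size):
    mini_batch = buffer.make_mini_batch(i, i + batch_size)
    # Alternate which sub-network gets updated on each mini-batch.
    # "model" as a part name is an assumption; only "policy" is shown in the diff.
    part = "policy" if (i // batch_size) % 2 == 0 else "model"
    update_stats = self.optimizer.update_part(mini_batch, n_sequences, part)
    for stat_name, value in update_stats.items():
        batch_update_stats[stat_name].append(value)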

ml-agents/mlagents/trainers/settings.py (10 changes)


separate_value_train: bool = False
separate_policy_train: bool = False
seprate_value_net: bool = False
separate_value_net: bool = False
use_var_encoder: bool = False
use_var_predict: bool = False
with_prior: bool = False

in_batch_alter: bool = False
in_epoch_alter: bool = False
use_op_buffer: bool = False
train_type: str = "all"
train_encoder: bool = True
train_model: bool = True
train_policy: bool = True
train_value: bool = True
feature_size: int = 16
# Transfer

transfer_path: str = ""
transfer_type: str = "dynamics"
load_model: bool = True
load_value: bool = True
load_policy: bool = True
# Network
encoder_layers: int = 1
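For reference, the new settings fields map onto the transfer workflow exercised in the test below. A minimal sketch of constructing them in Python, assuming PPOTransferSettings exposes the fields above as keyword arguments (consistent with the attrs-based settings classes used in the tests; the path value is a placeholder, not a real run):
transfer_hp = PPOTransferSettings(
    use_transfer=True,
    transfer_path="./transfer_results/<source_run>/Simple",  # placeholder path
    transfer_type="dynamics",
    load_model=True, load_policy=True, load_value=True,      # which nets to restore
    train_encoder=True, train_model=False,                   # which nets to keep training
    train_policy=False, train_value=False,
)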

ml-agents/mlagents/trainers/tests/test_simple_transfer.py (223 changes)


import math
import tempfile
import pytest
import numpy as np
import attr
from typing import Dict
from mlagents.trainers.tests.transfer_test_envs import SimpleTransferEnvironment
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.trainer_util import TrainerFactory
from mlagents.trainers.simple_env_manager import SimpleEnvManager
from mlagents.trainers.demo_loader import write_demo
from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary, TensorboardWriter, CSVWriter
from mlagents.trainers.settings import (
TrainerSettings,
PPOSettings,
PPOTransferSettings,
SACSettings,
NetworkSettings,
SelfPlaySettings,
BehavioralCloningSettings,
GAILSettings,
TrainerType,
RewardSignalType,
)
from mlagents.trainers.models import EncoderType, ScheduleType
from mlagents_envs.side_channel.environment_parameters_channel import (
EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
BRAIN_NAME = "Simple"
PPO_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO,
hyperparameters=PPOSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
),
network_settings=NetworkSettings(num_layers=2, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
)
SAC_CONFIG = TrainerSettings(
trainer_type=TrainerType.SAC,
hyperparameters=SACSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=8,
buffer_init_steps=100,
buffer_size=5000,
tau=0.01,
init_entcoef=0.01,
),
network_settings=NetworkSettings(num_layers=1, hidden_units=16),
summary_freq=100,
max_steps=1000,
threaded=False,
)
Transfer_CONFIG = TrainerSettings(
trainer_type=TrainerType.PPO_Transfer,
hyperparameters=PPOTransferSettings(
learning_rate=5.0e-3,
learning_rate_schedule=ScheduleType.CONSTANT,
batch_size=16,
buffer_size=64,
feature_size=2,
reuse_encoder=True,
in_epoch_alter=True,
in_batch_alter=False,
use_op_buffer=True,
policy_layers=1
),
network_settings=NetworkSettings(num_layers=1, hidden_units=32),
summary_freq=500,
max_steps=3000,
threaded=False,
)
# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
rewards_to_use = rewards[-last_n_rewards:]
# For debugging tests
print("Last {} rewards:".format(last_n_rewards), rewards_to_use)
return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
class DebugWriter(StatsWriter):
"""
Print to stdout so stats can be viewed in pytest
"""
def __init__(self):
self._last_reward_summary: Dict[str, float] = {}
self.stats = {}
def get_last_rewards(self):
return self._last_reward_summary
def write_stats(
self, category: str, values: Dict[str, StatsSummary], step: int
) -> None:
for val, stats_summary in values.items():
if val == "Environment/Cumulative Reward":
print(step, val, stats_summary.mean)
self.stats[step] = stats_summary.mean
self._last_reward_summary[category] = stats_summary.mean
def write2file(self, filename):
with open(filename, "w") as reward_file:
for step in self.stats.keys():
reward_file.write(str(step) + ":" + str(self.stats[step]) + "\n")
def _check_environment_trains(
env,
trainer_config,
reward_processor=default_reward_processor,
meta_curriculum=None,
success_threshold=0.9,
env_manager=None,
run_id="id",
seed=1337
):
# Create controller and begin training.
model_dir = "./transfer_results/" + run_id
StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
debug_writer = DebugWriter()
StatsReporter.add_writer(debug_writer)
csv_writer = CSVWriter(
model_dir,
required_fields=[
"Environment/Cumulative Reward",
"Environment/Episode Length",
],
)
tb_writer = TensorboardWriter(
model_dir, clear_past_data=True
)
StatsReporter.add_writer(tb_writer)
StatsReporter.add_writer(csv_writer)
if env_manager is None:
env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
trainer_factory = TrainerFactory(
trainer_config=trainer_config,
output_path=model_dir,
train_model=True,
load_model=False,
seed=seed,
meta_curriculum=meta_curriculum,
multi_gpu=False,
)
tc = TrainerController(
trainer_factory=trainer_factory,
output_path=model_dir,
run_id=run_id,
meta_curriculum=meta_curriculum,
train=True,
training_seed=seed,
)
# Begin training
tc.start_learning(env_manager)
# debug_writer.write2file(model_dir+"/reward.txt")
if (
success_threshold is not None
): # For tests where we are just checking setup and not reward
processed_rewards = [
reward_processor(rewards) for rewards in env.final_rewards.values()
]
assert all(not math.isnan(reward) for reward in processed_rewards)
assert all(reward > success_threshold for reward in processed_rewards)
def test_2d_model(config=Transfer_CONFIG, obs_spec_type="rich", run_id="modelbased_rich_5e-4", seed=1337):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
)
new_hyperparams = attr.evolve(
config.hyperparameters, batch_size=64, buffer_size=640, learning_rate=5.0e-4,
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)
def test_2d_transfer(config=Transfer_CONFIG, obs_spec_type="rich", run_id="transfer_rich_iealter_retrain-enc_5e-4", seed=1337):
env = SimpleTransferEnvironment(
[BRAIN_NAME], use_discrete=False, action_size=2, step_size=0.8, num_vector=2, obs_spec_type=obs_spec_type
)
new_hyperparams = attr.evolve(
config.hyperparameters, batch_size=64, buffer_size=640, use_transfer=True,
transfer_path="./transfer_results/modelbased_normal_opbuf_ibalter_s2/Simple",
use_op_buffer=True, in_epoch_alter=True, learning_rate=5.0e-4, train_policy=False,
train_value=False, train_model=False
)
config = attr.evolve(config, hyperparameters=new_hyperparams, max_steps=10000)
_check_environment_trains(env, {BRAIN_NAME: config}, run_id=run_id + "_s" + str(seed), seed=seed)
if __name__ == "__main__":
# test_2d_model(seed=0)
test_2d_transfer(seed=0)
# for i in range(5):
# test_2d_model(seed=i)
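As the __main__ block suggests, the test is meant to be run in two stages: test_2d_model first trains a source run (checkpoints presumably end up under ./transfer_results/<run_id>_s<seed>/Simple, given how model_dir is built above), and test_2d_transfer then restores from that directory via its hard-coded transfer_path. A sketch of that sequence with illustrative run ids:
if __name__ == "__main__":
    # Stage 1: train the source task and write checkpoints under ./transfer_results/...
    test_2d_model(obs_spec_type="normal", run_id="modelbased_normal", seed=0)
    # Stage 2: fine-tune on the "rich" observation variant, restoring the source nets.
    # Note: transfer_path inside test_2d_transfer is hard-coded and must point at the
    # directory produced in stage 1.
    test_2d_transfer(obs_spec_type="rich", run_id="transfer_rich", seed=0)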

ml-agents/mlagents/trainers/tests/transfer_test_envs.py (238 changes)


import random
from typing import Dict, List, Any, Tuple
import numpy as np
from mlagents_envs.base_env import (
BaseEnv,
BehaviorSpec,
DecisionSteps,
TerminalSteps,
ActionType,
BehaviorMapping,
)
from mlagents_envs.tests.test_rpc_utils import proto_from_steps_and_action
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
)
OBS_SIZE = 1
VIS_OBS_SIZE = (20, 20, 3)
STEP_SIZE = 0.1
TIME_PENALTY = 0.01
MIN_STEPS = int(1.0 / STEP_SIZE) + 1
SUCCESS_REWARD = 1.0 + MIN_STEPS * TIME_PENALTY
def clamp(x, min_val, max_val):
return max(min_val, min(x, max_val))
class SimpleTransferEnvironment(BaseEnv):
"""
Very simple "game" - the agent has a position on [-1, 1], gets a reward of 1 if it reaches 1, and a reward of -1 if
it reaches -1. The position is incremented by the action amount (clamped to [-step_size, step_size]).
"""
def __init__(
self,
brain_names,
use_discrete,
step_size=STEP_SIZE,
num_visual=0,
num_vector=1,
vis_obs_size=VIS_OBS_SIZE,
vec_obs_size=OBS_SIZE,
action_size=1,
obs_spec_type="normal" # normal: (x,y); rich: (x+y, x-y, x*y)
):
super().__init__()
self.discrete = use_discrete
self.num_visual = num_visual
self.num_vector = num_vector
self.vis_obs_size = vis_obs_size
self.vec_obs_size = vec_obs_size
self.obs_spec_type = obs_spec_type
action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
self.behavior_spec = BehaviorSpec(
self._make_obs_spec(),
action_type,
tuple(2 for _ in range(action_size)) if use_discrete else action_size,
)
self.action_size = action_size
self.names = brain_names
self.positions: Dict[str, List[float]] = {}
self.step_count: Dict[str, float] = {}
self.random = random.Random(str(self.behavior_spec))
self.goal: Dict[str, int] = {}
self.action = {}
self.rewards: Dict[str, float] = {}
self.final_rewards: Dict[str, List[float]] = {}
self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
self.agent_id: Dict[str, int] = {}
self.step_size = step_size # defines the difficulty of the test
for name in self.names:
self.agent_id[name] = 0
self.goal[name] = self.random.choice([-1, 1])
self.rewards[name] = 0
self.final_rewards[name] = []
self._reset_agent(name)
self.action[name] = None
self.step_result[name] = None
def _make_obs_spec(self) -> List[Any]:
obs_spec: List[Any] = []
# goal
for _ in range(self.num_vector):
obs_spec.append((self.vec_obs_size,))
for _ in range(self.num_visual):
obs_spec.append(self.vis_obs_size)
# position
if self.obs_spec_type == "normal":
for _ in range(self.num_vector):
obs_spec.append((self.vec_obs_size,))
# composed position
if self.obs_spec_type == "rich":
for _ in range(self.num_vector+1):
obs_spec.append((self.vec_obs_size,))
print("obs_spec:", obs_spec)
return obs_spec
def _make_obs(self, value: float) -> List[np.ndarray]:
obs = []
for _ in range(self.num_vector):
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * value)
if self.obs_spec_type == "normal":
for name in self.names:
for i in self.positions[name]:
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * i)
elif self.obs_spec_type == "rich":
for name in self.names:
i = self.positions[name][0]
j = self.positions[name][1]
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i+j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i-j))
obs.append(np.ones((1, self.vec_obs_size), dtype=np.float32) * (i*j))
for _ in range(self.num_visual):
obs.append(np.ones((1,) + self.vis_obs_size, dtype=np.float32) * value)
return obs
@property
def behavior_specs(self):
behavior_dict = {}
for n in self.names:
behavior_dict[n] = self.behavior_spec
return BehaviorMapping(behavior_dict)
def set_action_for_agent(self, behavior_name, agent_id, action):
pass
def set_actions(self, behavior_name, action):
self.action[behavior_name] = action
def get_steps(self, behavior_name):
return self.step_result[behavior_name]
def _take_action(self, name: str) -> bool:
deltas = []
for _act in self.action[name][0]:
if self.discrete:
deltas.append(1 if _act else -1)
else:
deltas.append(_act)
for i, _delta in enumerate(deltas):
_delta = clamp(_delta, -self.step_size, self.step_size)
self.positions[name][i] += _delta
self.positions[name][i] = clamp(self.positions[name][i], -1, 1)
self.step_count[name] += 1
# Both must be in 1.0 to be done
done = all(pos >= 1.0 or pos <= -1.0 for pos in self.positions[name])
# print(self.positions)
return done
def _generate_mask(self):
if self.discrete:
# LL-Python API will return an empty dim if there is only 1 agent.
ndmask = np.array(2 * self.action_size * [False], dtype=np.bool)
ndmask = np.expand_dims(ndmask, axis=0)
action_mask = [ndmask]
else:
action_mask = None
return action_mask
def _compute_reward(self, name: str, done: bool) -> float:
if done:
reward = 0.0
for _pos in self.positions[name]:
reward += (SUCCESS_REWARD * _pos * self.goal[name]) / len(
self.positions[name]
)
else:
reward = -TIME_PENALTY
return reward
def _reset_agent(self, name):
self.goal[name] = self.random.choice([-1, 1])
self.positions[name] = [0.0 for _ in range(self.action_size)]
self.step_count[name] = 0
self.rewards[name] = 0
self.agent_id[name] = self.agent_id[name] + 1
def _make_batched_step(
self, name: str, done: bool, reward: float
) -> Tuple[DecisionSteps, TerminalSteps]:
m_vector_obs = self._make_obs(self.goal[name])
m_reward = np.array([reward], dtype=np.float32)
m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
action_mask = self._generate_mask()
decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
terminal_step = TerminalSteps.empty(self.behavior_spec)
if done:
self.final_rewards[name].append(self.rewards[name])
self._reset_agent(name)
new_vector_obs = self._make_obs(self.goal[name])
(
new_reward,
new_done,
new_agent_id,
new_action_mask,
) = self._construct_reset_step(name)
decision_step = DecisionSteps(
new_vector_obs, new_reward, new_agent_id, new_action_mask
)
terminal_step = TerminalSteps(
m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
)
return (decision_step, terminal_step)
def _construct_reset_step(
self, name: str
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
new_reward = np.array([0.0], dtype=np.float32)
new_done = np.array([False], dtype=np.bool)
new_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
new_action_mask = self._generate_mask()
return new_reward, new_done, new_agent_id, new_action_mask
def step(self) -> None:
assert all(action is not None for action in self.action.values())
for name in self.names:
done = self._take_action(name)
reward = self._compute_reward(name, done)
self.rewards[name] += reward
self.step_result[name] = self._make_batched_step(name, done, reward)
def reset(self) -> None: # type: ignore
for name in self.names:
self._reset_agent(name)
self.step_result[name] = self._make_batched_step(name, False, 0.0)
@property
def reset_parameters(self) -> Dict[str, str]:
return {}
def close(self):
pass
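The only difference between the "normal" and "rich" observation specs is how the 2-D position is encoded: "normal" exposes (x, y) directly, while "rich" exposes the derived features (x+y, x-y, x*y), so a transferred encoder has to map a different observation space onto the same underlying state. A tiny standalone illustration of that transform (plain numpy, independent of the environment class):
import numpy as np
def rich_features(x: float, y: float) -> np.ndarray:
    # "Rich" encoding of a 2-D position, as built in _make_obs above.
    return np.array([x + y, x - y, x * y], dtype=np.float32)
print(rich_features(0.3, -0.5))  # approximately [-0.2, 0.8, -0.15]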