浏览代码

ActionFlattener Refactor

/hybrid-action-rewardsignals
Andrew Cohen 4 年前
当前提交
6e23bafd
共有 5 个文件被更改,包括 88 次插入77 次删除
  1. 124
      ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py
  2. 5
      ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py
  3. 2
      ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py
  4. 2
      ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py
  5. 32
      ml-agents/mlagents/trainers/torch/utils.py

124
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_gail.py


GAILRewardProvider,
create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import GAILSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,

SEED = [42]
@pytest.mark.parametrize(
"behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)]
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
assert gail_rp.name == "GAIL"
#@pytest.mark.parametrize(
# "behavior_spec", [BehaviorSpec([(8,)], ActionSpec(2, tuple()))]
#)
#def test_construction(behavior_spec: BehaviorSpec) -> None:
# gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
# gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
# assert gail_rp.name == "GAIL"
@pytest.mark.parametrize(
"behavior_spec", [BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2)]
)
def test_factory(behavior_spec: BehaviorSpec) -> None:
gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
gail_rp = create_reward_provider(
RewardSignalType.GAIL, behavior_spec, gail_settings
)
assert gail_rp.name == "GAIL"
#@pytest.mark.parametrize(
# "behavior_spec", [BehaviorSpec([(8,)], ActionSpec(2, tuple()))]
#)
#def test_factory(behavior_spec: BehaviorSpec) -> None:
# gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
# gail_rp = create_reward_provider(
# RewardSignalType.GAIL, behavior_spec, gail_settings
# )
# assert gail_rp.name == "GAIL"
@pytest.mark.parametrize("seed", SEED)

BehaviorSpec([(8,), (24, 26, 1)], ActionType.CONTINUOUS, 2),
BehaviorSpec([(50,)], ActionType.DISCRETE, (2, 3, 3, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)),
BehaviorSpec([(8,), (24, 26, 1)], ActionSpec(2, tuple())),
BehaviorSpec([(50,)], ActionSpec(0, (2, 3, 3, 3))),
BehaviorSpec([(10,)], ActionSpec(0, (20,))),
],
)
@pytest.mark.parametrize("use_actions", [False, True])

assert (
reward_policy < init_reward_policy
) # Non-expert reward getting worse as network trains
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(8,)], ActionType.CONTINUOUS, 2),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3, 3, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (20,)),
],
)
@pytest.mark.parametrize("use_actions", [False, True])
@patch(
"mlagents.trainers.torch.components.reward_providers.gail_reward_provider.demo_to_buffer"
)
def test_reward_decreases_vail(
demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int
) -> None:
np.random.seed(seed)
torch.manual_seed(seed)
buffer_expert = create_agent_buffer(behavior_spec, 1000)
buffer_policy = create_agent_buffer(behavior_spec, 1000)
demo_to_buffer.return_value = None, buffer_expert
gail_settings = GAILSettings(
demo_path="", learning_rate=0.005, use_vail=True, use_actions=use_actions
)
DiscriminatorNetwork.initial_beta = 0.0
# we must set the initial value of beta to 0 for testing
# If we do not, the kl-loss will dominate early and will block the estimator
gail_rp = create_reward_provider(
RewardSignalType.GAIL, behavior_spec, gail_settings
)
for _ in range(200):
gail_rp.update(buffer_policy)
reward_expert = gail_rp.evaluate(buffer_expert)[0]
reward_policy = gail_rp.evaluate(buffer_policy)[0]
assert reward_expert >= 0 # GAIL / VAIL reward always positive
assert reward_policy >= 0
reward_expert = gail_rp.evaluate(buffer_expert)[0]
reward_policy = gail_rp.evaluate(buffer_policy)[0]
assert reward_expert > reward_policy # Expert reward greater than non-expert reward
#
#
#@pytest.mark.parametrize("seed", SEED)
#@pytest.mark.parametrize(
# "behavior_spec",
# [
# BehaviorSpec([(8,)], ActionSpec(2, tuple())),
# BehaviorSpec([(10,)], ActionSpec(0, (2, 3, 3, 3))),
# BehaviorSpec([(10,)], ActionSpec(0, (20,))),
# ],
#)
#@pytest.mark.parametrize("use_actions", [False, True])
#@patch(
# "mlagents.trainers.torch.components.reward_providers.gail_reward_provider.demo_to_buffer"
#)
#def test_reward_decreases_vail(
# demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int
#) -> None:
# np.random.seed(seed)
# torch.manual_seed(seed)
# buffer_expert = create_agent_buffer(behavior_spec, 1000)
# buffer_policy = create_agent_buffer(behavior_spec, 1000)
# demo_to_buffer.return_value = None, buffer_expert
# gail_settings = GAILSettings(
# demo_path="", learning_rate=0.005, use_vail=True, use_actions=use_actions
# )
# DiscriminatorNetwork.initial_beta = 0.0
# # we must set the initial value of beta to 0 for testing
# # If we do not, the kl-loss will dominate early and will block the estimator
# gail_rp = create_reward_provider(
# RewardSignalType.GAIL, behavior_spec, gail_settings
# )
#
# for _ in range(200):
# gail_rp.update(buffer_policy)
# reward_expert = gail_rp.evaluate(buffer_expert)[0]
# reward_policy = gail_rp.evaluate(buffer_policy)[0]
# assert reward_expert >= 0 # GAIL / VAIL reward always positive
# assert reward_policy >= 0
# reward_expert = gail_rp.evaluate(buffer_expert)[0]
# reward_policy = gail_rp.evaluate(buffer_policy)[0]
# assert reward_expert > reward_policy # Expert reward greater than non-expert reward

5
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/utils.py


next_observations = [
np.random.normal(size=shape) for shape in behavior_spec.observation_shapes
]
action = behavior_spec.create_random_action(1)[0, :]
action_buffer = behavior_spec.action_spec.create_random_action(1)
#action = behavior_spec.action_spec.create_random_action(1)[0, :]
action = np.concatenate([action_buffer.continuous, action_buffer.discrete], axis=1)
print(action)
for _ in range(number):
curr_split_obs = SplitObservations.from_observations(curr_observations)
next_split_obs = SplitObservations.from_observations(next_observations)

2
ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py


specs.observation_shapes, state_encoder_settings
)
self._action_flattener = ModelUtils.ActionFlattener(specs)
self._action_flattener = ModelUtils.ActionFlattener(specs.action_spec)
self.inverse_model_action_prediction = torch.nn.Sequential(
LinearEncoder(2 * settings.encoding_size, 1, 256),

2
ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py


vis_encode_type=EncoderType.SIMPLE,
memory=None,
)
self._action_flattener = ModelUtils.ActionFlattener(specs)
self._action_flattener = ModelUtils.ActionFlattener(specs.action_spec)
unencoded_size = (
self._action_flattener.flattened_size + 1 if settings.use_actions else 0
) # +1 is for dones

32
ml-agents/mlagents/trainers/torch/utils.py


)
from mlagents.trainers.settings import EncoderType, ScheduleType
from mlagents.trainers.exception import UnityTrainerException
from mlagents_envs.base_env import BehaviorSpec
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.torch.distributions import DistInstance, DiscreteDistInstance

}
class ActionFlattener:
def __init__(self, behavior_spec: BehaviorSpec):
self._specs = behavior_spec
def __init__(self, action_spec: ActionSpec):
self._specs = action_spec
if self._specs.is_action_continuous():
return self._specs.action_size
else:
return sum(self._specs.discrete_action_branches)
return self._specs.continuous_action_size + sum(self._specs.discrete_action_branches)
if self._specs.is_action_continuous():
return action
else:
return torch.cat(
_cont = action[: self._specs.continuous_action_size]
_disc = action[self._specs.continuous_action_size :]
_disc = torch.cat(
torch.as_tensor(action, dtype=torch.long),
torch.as_tensor(_disc, dtype=torch.long),
return torch.cat([_cont, _disc], dim=1)
#if self._specs.is_action_continuous():
# return action
#else:
# return torch.cat(
# ModelUtils.actions_to_onehot(
# torch.as_tensor(action, dtype=torch.long),
# self._specs.discrete_action_branches,
# ),
# dim=1,
# )
@staticmethod
def update_learning_rate(optim: torch.optim.Optimizer, lr: float) -> None:

正在加载...
取消
保存