浏览代码

[bug-fix] Fix crash when demo size is smaller than batch size (#3591)

/bug-failed-api-check
GitHub 4 年前
当前提交
ffd8f855
共有 7 个文件被更改,包括 82 次插入18 次删除
  1. 1
      com.unity.ml-agents/CHANGELOG.md
  2. 5
      ml-agents/mlagents/trainers/components/reward_signals/__init__.py
  3. 5
      ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py
  4. 3
      ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py
  5. 16
      ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py
  6. 4
      ml-agents/mlagents/trainers/sac/optimizer.py
  7. 66
      ml-agents/mlagents/trainers/tests/test_ppo.py

1
com.unity.ml-agents/CHANGELOG.md


- `DecisionRequester` has been made internal (you can still use the DecisionRequesterComponent from the inspector). `RepeatAction` was renamed `TakeActionsBetweenDecisions` for clarity. (#3555)
- The `IFloatProperties` interface has been removed.
- Fix #3579.
- Fixed an issue when using GAIL with less than `batch_size` number of demonstrations. (#3591)
## [0.14.1-preview] - 2020-02-25

5
ml-agents/mlagents/trainers/components/reward_signals/__init__.py


from mlagents.trainers.exception import UnityTrainerException
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
logger = logging.getLogger("mlagents.trainers")

self.strength = strength
self.stats_name_to_update_name: Dict[str, str] = {}
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
"""
Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward
function drawn straight from a Buffer.

)
def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict

5
ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py


from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
from mlagents.trainers.policy.tf_policy import TFPolicy
from mlagents.trainers.buffer import AgentBuffer
class CuriosityRewardSignal(RewardSignal):

}
self.has_updated = False
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.sequence_length_ph: self.policy.sequence_length,

super().check_config(config_dict, param_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
Prepare for update and get feed_dict.

3
ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py


import numpy as np
from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
from mlagents.trainers.buffer import AgentBuffer
class ExtrinsicRewardSignal(RewardSignal):

param_keys = ["strength", "gamma"]
super().check_config(config_dict, param_keys)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
return RewardSignalResult(self.strength * env_rews, env_rews)

16
ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py


from mlagents.trainers.policy.tf_policy import TFPolicy
from .model import GAILModel
from mlagents.trainers.demo_loader import demo_to_buffer
from mlagents.trainers.buffer import AgentBuffer
class GAILRewardSignal(RewardSignal):

"Policy/GAIL Expert Estimate": "gail_expert_estimate",
}
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
feed_dict: Dict[tf.Tensor, Any] = {
self.policy.batch_size_ph: len(mini_batch["actions"]),
self.policy.sequence_length_ph: self.policy.sequence_length,

super().check_config(config_dict, param_keys)
def prepare_update(
self, policy: TFPolicy, mini_batch: Dict[str, np.ndarray], num_sequences: int
self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
) -> Dict[tf.Tensor, Any]:
"""
Prepare inputs for update. .

"""
max_num_experiences = min(
len(mini_batch["actions"]), self.demonstration_buffer.num_experiences
)
# If num_sequences is less, we need to shorten the input batch.
for key, element in mini_batch.items():
mini_batch[key] = element[:max_num_experiences]
# Get batch from demo buffer
# Get batch from demo buffer. Even if demo buffer is smaller, we sample with replacement
len(mini_batch["actions"]), 1
mini_batch.num_experiences, 1
)
feed_dict: Dict[tf.Tensor, Any] = {

4
ml-agents/mlagents/trainers/sac/optimizer.py


return update_stats
def update_reward_signals(
self, reward_signal_minibatches: Mapping[str, Dict], num_sequences: int
self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int
) -> Dict[str, float]:
"""
Only update the reward signals.

feed_dict: Dict[tf.Tensor, Any],
update_dict: Dict[str, tf.Tensor],
stats_needed: Dict[str, str],
reward_signal_minibatches: Mapping[str, Dict],
reward_signal_minibatches: Mapping[str, AgentBuffer],
num_sequences: int,
) -> None:
"""

66
ml-agents/mlagents/trainers/tests/test_ppo.py


from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.mock_brain import make_brain_parameters
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
from mlagents.trainers.tests.test_reward_signals import ( # noqa: F401; pylint: disable=unused-variable
curiosity_dummy_config,
gail_dummy_config,
)
@pytest.fixture

optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // dummy_config["sequence_length"],
)
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_curiosity(
curiosity_dummy_config, dummy_config, rnn, visual, discrete # noqa: F811
):
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(curiosity_dummy_config)
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
# We need to test this separately from test_reward_signals.py to ensure no interactions
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
tf.reset_default_graph()
dummy_config["reward_signals"].update(gail_dummy_config)
optimizer = _create_ppo_optimizer_ops_mock(
dummy_config, use_rnn=False, use_discrete=False, use_visual=False
)
# Test update
update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)
# Check if buffer size is too big
update_buffer = mb.simulate_rollout(3000, optimizer.policy.brain)
# Mock out reward signal eval
update_buffer["advantages"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
update_buffer["gail_returns"] = update_buffer["environment_rewards"]
update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
optimizer.update(
update_buffer,
num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
)

正在加载...
取消
保存