
Merge pull request #4878 from Unity-Technologies/reward-dist

Track histogram of environment reward
GitHub · 4 years ago
Current commit: f027e12d
7 files changed, with 48 insertions and 53 deletions
  1. com.unity.ml-agents/CHANGELOG.md (1 change)
  2. com.unity.ml-agents/Runtime/StatsRecorder.cs (7 changes)
  3. ml-agents-envs/mlagents_envs/side_channel/stats_side_channel.py (3 changes)
  4. ml-agents/mlagents/trainers/stats.py (32 changes)
  5. ml-agents/mlagents/trainers/tests/test_agent_processor.py (18 changes)
  6. ml-agents/mlagents/trainers/tests/test_stats.py (34 changes)
  7. ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changes)

com.unity.ml-agents/CHANGELOG.md (1 change)


 default implementation will wrap `ActuatorComponent.CreateActuator` in an array and return that. (#4899)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- Tensorboard now logs the Environment Reward as both a scalar and a histogram. (#4878)
 - Added a `--torch-device` commandline option to `mlagents-learn`, which sets the default
   [`torch.device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.torch.device) used for training. (#4888)
 - The `--cpu` commandline option had no effect and was removed. Use `--torch-device=cpu` to force CPU training. (#4888)
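
For context on the `--torch-device` option, a minimal sketch (not part of this diff) of what the flag selects on the PyTorch side; the tensor below is purely illustrative:

```python
import torch

# --torch-device=cpu selects this device for training;
# --torch-device=cuda:0 would select the first GPU instead.
device = torch.device("cpu")

# Illustrative only: tensors and model parameters created during training
# end up on the selected device.
example = torch.zeros(4, device=device)
print(example.device)  # cpu
```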

com.unity.ml-agents/Runtime/StatsRecorder.cs (7 changes)


         /// <summary>
         /// Values within the summary period are summed up before reporting.
         /// </summary>
-        Sum = 2
+        Sum = 2,
+
+        /// <summary>
+        /// Values within the summary period are reported as a histogram.
+        /// </summary>
+        Histogram = 3
     }
     /// <summary>

ml-agents-envs/mlagents_envs/side_channel/stats_side_channel.py (3 changes)


     # Values within the summary period are summed up before reporting.
     SUM = 2
+    # All values within a summary period are reported as a histogram.
+    HISTOGRAM = 3

 StatList = List[Tuple[float, StatsAggregationMethod]]
 EnvironmentStats = Mapping[str, StatList]
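
For illustration, a hedged sketch of the `EnvironmentStats` payload the side channel produces. The stat key and values are made up, and the `AVERAGE`/`MOST_RECENT` numeric values are assumed; only `SUM = 2` and `HISTOGRAM = 3` appear in this diff.

```python
from enum import Enum
from typing import List, Mapping, Tuple


class StatsAggregationMethod(Enum):
    # Standalone mirror of the enum for this sketch; the real definition lives
    # in mlagents_envs/side_channel/stats_side_channel.py.
    AVERAGE = 0  # assumed value
    MOST_RECENT = 1  # assumed value
    SUM = 2
    HISTOGRAM = 3


StatList = List[Tuple[float, StatsAggregationMethod]]
EnvironmentStats = Mapping[str, StatList]

# Hypothetical payload: several per-episode rewards collected during one
# summary period, each tagged for histogram aggregation.
env_stats: EnvironmentStats = {
    "Environment/Cumulative Reward": [
        (1.5, StatsAggregationMethod.HISTOGRAM),
        (0.2, StatsAggregationMethod.HISTOGRAM),
        (2.7, StatsAggregationMethod.HISTOGRAM),
    ]
}
```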

ml-agents/mlagents/trainers/stats.py (32 changes)


 class StatsSummary(NamedTuple):
-    mean: float
-    std: float
-    num: int
-    sum: float
+    full_dist: List[float]

-        return StatsSummary(0.0, 0.0, 0, 0.0, StatsAggregationMethod.AVERAGE)
+        return StatsSummary([], StatsAggregationMethod.AVERAGE)

     @property
     def aggregated_value(self):

         return self.mean

+    @property
+    def mean(self):
+        return np.mean(self.full_dist)
+
+    @property
+    def std(self):
+        return np.std(self.full_dist)
+
+    @property
+    def num(self):
+        return len(self.full_dist)
+
+    @property
+    def sum(self):
+        return np.sum(self.full_dist)


 class StatsPropertyType(Enum):
     HYPERPARAMETERS = "hyperparameters"
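
To make the reshaped `StatsSummary` concrete, a small usage sketch with made-up values; because `mean`, `std`, `num`, and `sum` are now properties derived from `full_dist`, existing callers keep working unchanged.

```python
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents.trainers.stats import StatsSummary

summary = StatsSummary(
    full_dist=[1.0, 3.0], aggregation_method=StatsAggregationMethod.HISTOGRAM
)
print(summary.mean)  # 2.0, computed from full_dist via np.mean
print(summary.std)   # 1.0
print(summary.num)   # 2
print(summary.sum)   # 4.0
```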

             self.summary_writers[category].add_scalar(
                 f"{key}", value.aggregated_value, step
             )
+            if value.aggregation_method == StatsAggregationMethod.HISTOGRAM:
+                self.summary_writers[category].add_histogram(
+                    f"{key}_hist", np.array(value.full_dist), step
+                )
         self.summary_writers[category].flush()

     def _maybe_create_summary_writer(self, category: str) -> None:
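
The underlying TensorBoard calls can be sketched outside ml-agents with a plain `SummaryWriter`; the log directory, tag, and values below are hypothetical. The scalar keeps existing charts intact, while the `_hist` tag adds the distribution view.

```python
import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("results/run_id/BehaviorName")  # hypothetical log dir
rewards = np.array([1.5, 0.2, 2.7])  # hypothetical per-episode rewards

# One scalar per summary period (the aggregated value)...
writer.add_scalar("Environment/Cumulative Reward", rewards.mean(), 10)
# ...plus the full per-period distribution under a "_hist" suffix.
writer.add_histogram("Environment/Cumulative Reward_hist", rewards, 10)
writer.flush()
```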

             return StatsSummary.empty()
         return StatsSummary(
-            mean=np.mean(stat_values),
-            std=np.std(stat_values),
-            num=len(stat_values),
-            sum=np.sum(stat_values),
+            full_dist=stat_values,
             aggregation_method=StatsReporter.stats_aggregation[self.category][key],
         )

ml-agents/mlagents/trainers/tests/test_agent_processor.py (18 changes)


     expected_stats = {
         "averaged": StatsSummary(
-            mean=2.0,
-            std=mock.ANY,
-            num=2,
-            sum=4.0,
-            aggregation_method=StatsAggregationMethod.AVERAGE,
+            full_dist=[1.0, 3.0], aggregation_method=StatsAggregationMethod.AVERAGE
-            mean=4.0,
-            std=0.0,
-            num=1,
-            sum=4.0,
-            aggregation_method=StatsAggregationMethod.MOST_RECENT,
+            full_dist=[4.0], aggregation_method=StatsAggregationMethod.MOST_RECENT
-            mean=2.1,
-            std=mock.ANY,
-            num=2,
-            sum=4.2,
-            aggregation_method=StatsAggregationMethod.SUM,
+            full_dist=[3.1, 1.1], aggregation_method=StatsAggregationMethod.SUM
         ),
     }
     stats_reporter.write_stats(123)

ml-agents/mlagents/trainers/tests/test_stats.py (34 changes)


     with tempfile.TemporaryDirectory(prefix="unittest-") as base_dir:
         tb_writer = TensorboardWriter(base_dir, clear_past_data=False)
         statssummary1 = StatsSummary(
-            mean=1.0,
-            std=1.0,
-            num=1,
-            sum=1.0,
-            aggregation_method=StatsAggregationMethod.AVERAGE,
+            full_dist=[1.0], aggregation_method=StatsAggregationMethod.AVERAGE
         )
         tb_writer.write_stats("category1", {"key1": statssummary1}, 10)

 def test_tensorboard_writer_clear(tmp_path):
     tb_writer = TensorboardWriter(tmp_path, clear_past_data=False)
     statssummary1 = StatsSummary(
-        mean=1.0,
-        std=1.0,
-        num=1,
-        sum=1.0,
-        aggregation_method=StatsAggregationMethod.AVERAGE,
+        full_dist=[1.0], aggregation_method=StatsAggregationMethod.AVERAGE
     )
     tb_writer.write_stats("category1", {"key1": statssummary1}, 10)
     # TB has some sort of timeout before making a new file

         category = "category1"
         console_writer = ConsoleWriter()
         statssummary1 = StatsSummary(
-            mean=1.0,
-            std=1.0,
-            num=1,
-            sum=1.0,
-            aggregation_method=StatsAggregationMethod.AVERAGE,
+            full_dist=[1.0], aggregation_method=StatsAggregationMethod.AVERAGE
         )
         console_writer.write_stats(
             category,

             10,
         )
         statssummary2 = StatsSummary(
-            mean=0.0,
-            std=0.0,
-            num=1,
-            sum=0.0,
-            aggregation_method=StatsAggregationMethod.AVERAGE,
+            full_dist=[0.0], aggregation_method=StatsAggregationMethod.AVERAGE
         )
         console_writer.write_stats(
             category,

         )
         self.assertIn(
-            "Mean Reward: 1.000. Std of Reward: 1.000. Training.", cm.output[0]
+            "Mean Reward: 1.000. Std of Reward: 0.000. Training.", cm.output[0]
         )
         self.assertIn("Not Training.", cm.output[1])

         console_writer = ConsoleWriter()
         console_writer.add_property(category, StatsPropertyType.SELF_PLAY, True)
         statssummary1 = StatsSummary(
-            mean=1.0,
-            std=1.0,
-            num=1,
-            sum=1.0,
-            aggregation_method=StatsAggregationMethod.AVERAGE,
+            full_dist=[1.0], aggregation_method=StatsAggregationMethod.AVERAGE
         )
         console_writer.write_stats(
             category,

         )
         self.assertIn(
-            "Mean Reward: 1.000. Std of Reward: 1.000. Training.", cm.output[0]
+            "Mean Reward: 1.000. Std of Reward: 0.000. Training.", cm.output[0]
         )

ml-agents/mlagents/trainers/trainer/rl_trainer.py (6 changes)


 import abc
 import time
 import attr
+from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
 from mlagents.trainers.policy.checkpoint_manager import (
     ModelCheckpoint,
     ModelCheckpointManager,

         for name, rewards in self.collected_rewards.items():
             if name == "environment":
                 self.stats_reporter.add_stat(
-                    "Environment/Cumulative Reward", rewards.get(agent_id, 0)
+                    "Environment/Cumulative Reward",
+                    rewards.get(agent_id, 0),
+                    aggregation=StatsAggregationMethod.HISTOGRAM,
                 )
                 self.cumulative_returns_since_policy_update.append(
                     rewards.get(agent_id, 0)
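
Outside the trainer, the same reporting path can be sketched directly against `StatsReporter`; the category, output directory, and reward values are hypothetical. Each episode return is added individually so the writer later receives the full distribution rather than a single aggregate.

```python
from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod
from mlagents.trainers.stats import StatsReporter, TensorboardWriter

StatsReporter.add_writer(TensorboardWriter("results/run_id"))  # hypothetical dir
reporter = StatsReporter("BehaviorName")  # hypothetical category

for episode_return in (1.5, 0.2, 2.7):
    reporter.add_stat(
        "Environment/Cumulative Reward",
        episode_return,
        aggregation=StatsAggregationMethod.HISTOGRAM,
    )
reporter.write_stats(step=10)
```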
