
Stats reporting is working

/develop-newnormalization
Ervin Teng, 5 years ago
Current commit
9c5fdd31
5 files changed, 14 insertions and 167 deletions
  1. ml-agents/mlagents/trainers/components/bc/module.py (1 changed line)
  2. ml-agents/mlagents/trainers/ppo/trainer.py (166 changed lines)
  3. ml-agents/mlagents/trainers/rl_trainer.py (3 changed lines)
  4. ml-agents/mlagents/trainers/sac/trainer.py (2 changed lines)
  5. ml-agents/mlagents/trainers/trainer_controller.py (9 changed lines)

ml-agents/mlagents/trainers/components/bc/module.py (1 changed line)


        feed_dict[self.policy.model.prev_action] = mini_batch_demo[
            "prev_action"
        ]
        network_out = self.policy.sess.run(
            list(self.out_dict.values()), feed_dict=feed_dict
        )
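For context on this hunk: the module feeds the demonstration mini-batch (including prev_action for recurrent policies) into a single sess.run over every tensor in self.out_dict, and a common companion step is to zip the results back to their names so callers can look them up by key. Below is a minimal, self-contained sketch of that pattern with placeholder tensors; the names obs, out_dict, and run_out are illustrative, not the module's actual attributes.

import numpy as np
import tensorflow.compat.v1 as tf  # TF1-style graph mode, matching the module above

tf.disable_v2_behavior()

# Stand-in graph: one placeholder and two named outputs (assumed names,
# not the BC module's actual tensors).
obs = tf.placeholder(tf.float32, shape=[None, 3], name="obs")
out_dict = {
    "sum": tf.reduce_sum(obs),
    "mean": tf.reduce_mean(obs),
}

with tf.Session() as sess:
    feed_dict = {obs: np.ones((4, 3), dtype=np.float32)}
    # Run every output in a single pass, then zip the results back to their
    # names, mirroring how the module consumes self.out_dict / network_out.
    network_out = sess.run(list(out_dict.values()), feed_dict=feed_dict)
    run_out = dict(zip(out_dict.keys(), network_out))
    print(run_out)  # {'sum': 12.0, 'mean': 1.0}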

ml-agents/mlagents/trainers/ppo/trainer.py (166 changed lines)


import logging
from collections import defaultdict
from typing import Dict
from mlagents.envs.brain import BrainInfo
from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput
from mlagents.trainers.rl_trainer import RLTrainer
from mlagents.envs.action_info import ActionInfoOutputs
logger = logging.getLogger("mlagents.trainers")

        )
        for _reward_signal in self.policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = {}
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    def process_trajectory(self, trajectory: Trajectory) -> None:
        """

        value_next = self.policy.get_value_estimates2(trajectory.next_step)
        # Evaluate all reward functions
        self.collected_rewards["environment"][agent_id] += np.sum(
            agent_buffer_trajectory["environment_rewards"]
        )
            evaluate_result = reward_signal.evaluate_batch(agent_buffer_trajectory)
            agent_buffer_trajectory["{}_rewards".format(name)].extend(
                evaluate_result.scaled_reward
            )
            self.collected_rewards[name][agent_id] += np.sum(
                evaluate_result.unscaled_reward
            )
            evaluate_result = reward_signal.evaluate_batch(
                agent_buffer_trajectory
            ).scaled_reward
            agent_buffer_trajectory["{}_rewards".format(name)].extend(evaluate_result)
            # Report the reward signals
            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
        # Compute GAE and returns
        tmp_advantages = []
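The two adjacent variants of the reward-signal loop above differ only in what they accumulate for reporting: both extend the trajectory buffer with the scaled reward used for training, while one also tracks the unscaled reward per agent for stats. Below is a small, self-contained sketch of that split, using a hypothetical RewardResult tuple and plain containers in place of the trainer's buffer types.

from collections import defaultdict
from typing import Dict, List, NamedTuple
import numpy as np

class RewardResult(NamedTuple):
    # Hypothetical stand-in for a reward signal's batch output.
    scaled_reward: np.ndarray    # what the agent is trained on
    unscaled_reward: np.ndarray  # what gets reported in stats

signals: Dict[str, RewardResult] = {
    "extrinsic": RewardResult(np.array([0.5, 0.5]), np.array([1.0, 1.0])),
    "curiosity": RewardResult(np.array([0.02, 0.04]), np.array([0.2, 0.4])),
}

buffer: Dict[str, List[float]] = defaultdict(list)
collected_rewards: Dict[str, Dict[str, float]] = defaultdict(lambda: defaultdict(lambda: 0))
agent_id = "agent-0"

for name, result in signals.items():
    # Training target: the scaled reward goes into the trajectory buffer.
    buffer["{}_rewards".format(name)].extend(result.scaled_reward)
    # Reporting: cumulative unscaled reward per agent, per signal.
    collected_rewards[name][agent_id] += np.sum(result.unscaled_reward)

print({name: dict(per_agent) for name, per_agent in collected_rewards.items()})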

                        rewards.get(agent_id, 0)
                    )
                    rewards[agent_id] = 0
    def process_experiences(
        self, current_info: BrainInfo, next_info: BrainInfo
    ) -> None:
        pass
        # """
        # Checks agent histories for processing condition, and processes them as necessary.
        # Processing involves calculating value and advantage targets for model updating step.
        # :param current_info: current BrainInfo.
        # :param next_info: next BrainInfo.
        # """
        # if self.is_training:
        #     self.policy.update_normalization(next_info.vector_observations)
        # for l in range(len(next_info.agents)):
        #     agent_actions = self.processing_buffer[next_info.agents[l]]["actions"]
        #     if (
        #         next_info.local_done[l]
        #         or len(agent_actions) > self.trainer_parameters["time_horizon"]
        #     ) and len(agent_actions) > 0:
        #         agent_id = next_info.agents[l]
        #         if next_info.max_reached[l]:
        #             bootstrapping_info = self.processing_buffer[
        #                 agent_id
        #             ].last_brain_info
        #             idx = bootstrapping_info.agents.index(agent_id)
        #         else:
        #             bootstrapping_info = next_info
        #             idx = l
        #         value_next = self.policy.get_value_estimates(
        #             bootstrapping_info,
        #             idx,
        #             next_info.local_done[l] and not next_info.max_reached[l],
        #         )
        #         tmp_advantages = []
        #         tmp_returns = []
        #         for name in self.policy.reward_signals:
        #             bootstrap_value = value_next[name]
        #             local_rewards = self.processing_buffer[agent_id][
        #                 "{}_rewards".format(name)
        #             ].get_batch()
        #             local_value_estimates = self.processing_buffer[agent_id][
        #                 "{}_value_estimates".format(name)
        #             ].get_batch()
        #             local_advantage = get_gae(
        #                 rewards=local_rewards,
        #                 value_estimates=local_value_estimates,
        #                 value_next=bootstrap_value,
        #                 gamma=self.policy.reward_signals[name].gamma,
        #                 lambd=self.trainer_parameters["lambd"],
        #             )
        #             local_return = local_advantage + local_value_estimates
        #             # This is later use as target for the different value estimates
        #             self.processing_buffer[agent_id]["{}_returns".format(name)].set(
        #                 local_return
        #             )
        #             self.processing_buffer[agent_id]["{}_advantage".format(name)].set(
        #                 local_advantage
        #             )
        #             tmp_advantages.append(local_advantage)
        #             tmp_returns.append(local_return)
        #         global_advantages = list(
        #             np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
        #         )
        #         global_returns = list(
        #             np.mean(np.array(tmp_returns, dtype=np.float32), axis=0)
        #         )
        #         self.processing_buffer[agent_id]["advantages"].set(global_advantages)
        #         self.processing_buffer[agent_id]["discounted_returns"].set(
        #             global_returns
        #         )
        #         self.processing_buffer.append_to_update_buffer(
        #             self.update_buffer,
        #             agent_id,
        #             batch_size=None,
        #             training_length=self.policy.sequence_length,
        #         )
        #         self.processing_buffer[agent_id].reset_agent()
        #         if next_info.local_done[l]:
        #             self.stats["Environment/Episode Length"].append(
        #                 self.episode_steps.get(agent_id, 0)
        #             )
        #             self.episode_steps[agent_id] = 0
        #             for name, rewards in self.collected_rewards.items():
        #                 if name == "environment":
        #                     self.cumulative_returns_since_policy_update.append(
        #                         rewards.get(agent_id, 0)
        #                     )
        #                     self.stats["Environment/Cumulative Reward"].append(
        #                         rewards.get(agent_id, 0)
        #                     )
        #                     self.reward_buffer.appendleft(rewards.get(agent_id, 0))
        #                     rewards[agent_id] = 0
        #                 else:
        #                     self.stats[
        #                         self.policy.reward_signals[name].stat_name
        #                     ].append(rewards.get(agent_id, 0))
        #                     rewards[agent_id] = 0
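The commented-out process_experiences body above computed per-reward-signal advantages with get_gae and averaged them into global advantages and returns, work that now happens on the trajectory path. For reference, here is a short, self-contained generalized advantage estimation sketch; the function mirrors the call above, but it is an illustrative implementation, not the trainer's own helper.

import numpy as np

def discount_rewards(r, gamma=0.99, value_next=0.0):
    # Backwards pass: discounted sum of future values, seeded with the bootstrap value.
    discounted = np.zeros_like(r)
    running = value_next
    for t in reversed(range(len(r))):
        running = r[t] + gamma * running
        discounted[t] = running
    return discounted

def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    # GAE: discount the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    # with gamma * lambd, bootstrapping the final step with value_next.
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.append(np.asarray(value_estimates, dtype=np.float32), value_next)
    deltas = rewards + gamma * values[1:] - values[:-1]
    return discount_rewards(deltas, gamma * lambd)

# Matching the commented-out usage: the return target is advantage + value estimate.
local_rewards = [0.0, 0.0, 1.0]
local_value_estimates = [0.1, 0.2, 0.5]
local_advantage = get_gae(rewards=local_rewards, value_estimates=local_value_estimates)
local_return = local_advantage + np.asarray(local_value_estimates, dtype=np.float32)
print(local_advantage, local_return)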
    def add_policy_outputs(
        self, take_action_outputs: ActionInfoOutputs, agent_id: str, agent_idx: int
    ) -> None:
        """
        Takes the output of the last action and store it into the training buffer.
        """
        actions = take_action_outputs["action"]
        if self.policy.use_continuous_act:
            actions_pre = take_action_outputs["pre_action"]
            self.processing_buffer[agent_id]["actions_pre"].append(
                actions_pre[agent_idx]
            )
            epsilons = take_action_outputs["random_normal_epsilon"]
            self.processing_buffer[agent_id]["random_normal_epsilon"].append(
                epsilons[agent_idx]
            )
        a_dist = take_action_outputs["log_probs"]
        # value is a dictionary from name of reward to value estimate of the value head
        self.processing_buffer[agent_id]["actions"].append(actions[agent_idx])
        self.processing_buffer[agent_id]["action_probs"].append(a_dist[agent_idx])

    def add_rewards_outputs(
        self,
        rewards_out: AllRewardsOutput,
        values: Dict[str, np.ndarray],
        agent_id: str,
        agent_idx: int,
        agent_next_idx: int,
    ) -> None:
        """
        Takes the value output of the last action and store it into the training buffer.
        """
        for name, reward_result in rewards_out.reward_signals.items():
            # 0 because we use the scaled reward to train the agent
            self.processing_buffer[agent_id]["{}_rewards".format(name)].append(
                reward_result.scaled_reward[agent_next_idx]
            )
            self.processing_buffer[agent_id]["{}_value_estimates".format(name)].append(
                values[name][agent_idx][0]
            )

    def is_ready_update(self):
        """

ml-agents/mlagents/trainers/rl_trainer.py (3 changed lines)


# # Unity ML-Agents Toolkit
import logging
from typing import Dict, List, Any, NamedTuple
from collections import defaultdict
import numpy as np
from mlagents.envs.brain import BrainInfo

        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.collected_rewards = {"environment": {}}
        self.collected_rewards = {"environment": defaultdict(lambda: 0)}
        self.processing_buffer = ProcessingBuffer()
        self.update_buffer = AgentBuffer()
        self.episode_steps = {}
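Switching collected_rewards from a plain dict to defaultdict(lambda: 0) means the first += for a new agent_id no longer needs an explicit key check. A small illustration of the difference (variable names here are only for the demo):

from collections import defaultdict

# Plain dict: the first accumulation for a new agent needs a guard.
plain = {}
agent_id = "agent-0"
if agent_id not in plain:
    plain[agent_id] = 0
plain[agent_id] += 1.5

# defaultdict(lambda: 0): missing keys start at 0, so += just works.
collected = defaultdict(lambda: 0)
collected[agent_id] += 1.5

assert plain[agent_id] == collected[agent_id] == 1.5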

ml-agents/mlagents/trainers/sac/trainer.py (2 changed lines)


        )
        for _reward_signal in self.policy.reward_signals.keys():
            self.collected_rewards[_reward_signal] = {}
            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
        self.episode_steps = {}

ml-agents/mlagents/trainers/trainer_controller.py (9 changed lines)


                    step_info.current_all_brain_info[brain_name],
                    step_info.brain_name_to_action_info[brain_name].outputs,
                )
                # trainer.add_experiences(
                #     step_info.previous_all_brain_info[brain_name],
                #     step_info.current_all_brain_info[brain_name],
                #     step_info.brain_name_to_action_info[brain_name].outputs,
                # )
                # trainer.process_experiences(
                #     step_info.previous_all_brain_info[brain_name],
                #     step_info.current_all_brain_info[brain_name],
                # )
        for brain_name, trainer in self.trainers.items():
            if brain_name in self.trainer_metrics:
                self.trainer_metrics[brain_name].add_delta_step(delta_time_step)
