
Slightly closer to running model

/develop/add-fire
Arthur Juliani, 5 years ago
Current commit: 947f0d32
4 files changed, 60 insertions and 51 deletions
  1. ml-agents/mlagents/trainers/models_torch.py (2 lines changed)
  2. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (68 lines changed)
  3. ml-agents/mlagents/trainers/policy/torch_policy.py (1 line changed)
  4. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (40 lines changed)

ml-agents/mlagents/trainers/models_torch.py (2 lines changed)


class Normalizer(nn.Module):
    def __init__(self, vec_obs_size, **kwargs):
        super(Normalizer, self).__init__(**kwargs)
        print(vec_obs_size)
        self.normalization_steps = torch.tensor(1)
        self.running_mean = torch.zeros(vec_obs_size)
        self.running_variance = torch.ones(vec_obs_size)

        for _ in range(num_layers - 1):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
            self.layers.append(nn.ReLU())
        print(self.layers)

    def forward(self, inputs):
        x = inputs
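The hunk above only adds the constructor for Normalizer (plus two debug prints); neither the statistics update nor the normalization itself appears in this diff. As an editorial sketch of how such a running normalizer typically works, assuming an incremental (Chan et al. style) mean and variance update, with the class name RunningNormalizer and the update() method being illustrative rather than code from this commit:

import torch
import torch.nn as nn


class RunningNormalizer(nn.Module):
    """Editorial sketch of a running observation normalizer; not code from this commit."""

    def __init__(self, vec_obs_size: int):
        super().__init__()
        # Buffers so the statistics are saved with the model but never trained.
        self.register_buffer("normalization_steps", torch.tensor(1.0))
        self.register_buffer("running_mean", torch.zeros(vec_obs_size))
        self.register_buffer("running_variance", torch.ones(vec_obs_size))

    def update(self, vector_obs: torch.Tensor) -> None:
        # Incremental update of count, mean, and summed squared deviations.
        batch_count = vector_obs.shape[0]
        batch_mean = vector_obs.mean(dim=0)
        batch_sq_dev = ((vector_obs - batch_mean) ** 2).sum(dim=0)
        delta = batch_mean - self.running_mean
        total = self.normalization_steps + batch_count
        self.running_mean = self.running_mean + delta * batch_count / total
        self.running_variance = (
            self.running_variance
            + batch_sq_dev
            + delta ** 2 * self.normalization_steps * batch_count / total
        )
        self.normalization_steps = total

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        # Normalize with the running statistics; clamp avoids division by zero.
        std = torch.sqrt(self.running_variance / self.normalization_steps).clamp(min=1e-6)
        return (inputs - self.running_mean) / std

Registering the statistics as buffers keeps them out of the optimizer's parameter list while still saving them with the model checkpoint.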

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (68 lines changed)


from typing import Dict, Any, Optional
from typing import Dict, Any, Optional, Tuple, List
import numpy as np
from mlagents_envs.base_env import DecisionSteps
from mlagents.trainers.components.reward_signals.extrinsic.signal import (
    ExtrinsicRewardSignal,
)
from mlagents.trainers.components.reward_signals.reward_signal_factory import (
    create_reward_signal,
)
from mlagents.trainers.trajectory import SplitObservations
class TorchOptimizer(Optimizer):  # pylint: disable=W0223

    def create_reward_signals(self, reward_signal_configs):
        """
        Create reward signals
        :param reward_signal_configs: Reward signal config.
        """
        self.reward_signals = {}
        extrinsic_signal = ExtrinsicRewardSignal(
            self.policy, **reward_signal_configs["extrinsic"]
        )
        self.reward_signals = {"extrinsic": extrinsic_signal}
        for reward_signal, config in reward_signal_configs.items():
            self.reward_signals[reward_signal] = create_reward_signal(
                self.policy, reward_signal, config
            )
            self.update_dict.update(self.reward_signals[reward_signal].update_dict)
        # for reward_signal, config in reward_signal_configs.items():
        #     self.reward_signals[reward_signal] = create_reward_signal(
        #         self.policy, reward_signal, config
        #     )
        #     self.update_dict.update(self.reward_signals[reward_signal].update_dict)
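For context, reward_signal_configs is the dictionary parsed from the trainer configuration's reward_signals section; its "extrinsic" entry is unpacked into keyword arguments above. A hypothetical example of the shape this code expects (the concrete keys and values below are assumptions, not taken from this commit):

# Hypothetical reward_signal_configs; actual values come from the trainer YAML.
reward_signal_configs = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
}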
    def get_value_estimates(
        self, decision_requests: DecisionSteps, idx: int, done: bool
    ) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param decision_requests:
        :param idx: Index in BrainInfo of agent.
        :param done: Whether or not this is the last element of the episode,
        in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal
        and the value the corresponding value estimate.
        """
        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
        value_estimates, mean_value = self.policy.critic(
            np.expand_dims(vec_vis_obs.vector_observations[idx], 0),
            np.expand_dims(vec_vis_obs.visual_observations[idx], 0),
        )
        value_estimates = {k: float(v) for k, v in value_estimates.items()}
        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0
        return value_estimates

    def get_trajectory_value_estimates(
        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        vector_obs = torch.Tensor(np.array(batch["vector_obs"]))
        visual_obs = batch["visual_obs"]
        value_estimates, mean_value = self.policy.critic(vector_obs, visual_obs)
        for name, estimate in value_estimates.items():
            value_estimates[name] = estimate.detach()
        return value_estimates, value_estimates
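The per-reward-signal estimates returned here are what the trainer uses to bootstrap returns when a trajectory is cut off before the episode ends. A minimal sketch of that use, assuming a plain discounted-return computation (the discounted_returns helper below is illustrative, not part of this diff):

import numpy as np
from typing import List


def discounted_returns(rewards: List[float], gamma: float, bootstrap_value: float) -> np.ndarray:
    # Accumulate rewards backwards, seeding with the bootstrap value estimate.
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# e.g. discounted_returns(rewards, gamma=0.99, bootstrap_value=value_estimates["extrinsic"])

When done is True and a reward signal uses terminal states, get_value_estimates returns 0.0 for that signal, so the bootstrap term vanishes.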

ml-agents/mlagents/trainers/policy/torch_policy.py (1 line changed)


    def update_normalization(self, vector_obs):
        """
        If this policy normalizes vector observations, this will update the norm values in the graph.
        :param vector_obs: The vector observations to add to the running estimate of the distribution.
        """
        vector_obs = np.array(vector_obs)
        print(vector_obs.shape)
        vector_obs = [vector_obs]
        if self.use_vec_obs and self.normalize:
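The hunk is truncated at the if statement, so the update call itself is not visible here. A hedged sketch of the expected call pattern, where trajectory_vector_obs and policy are stand-in names rather than identifiers from this diff:

import numpy as np

# Hypothetical call site: after a trajectory is collected, feed its stacked
# vector observations to the policy's running normalizer.
vector_obs = np.asarray(trajectory_vector_obs)  # assumed shape: (num_steps, vec_obs_size)
if policy.use_vec_obs and policy.normalize:
    policy.update_normalization(vector_obs)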

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (40 lines changed)


from typing import Any, Dict, List, Tuple
from typing import Any, Dict
from mlagents.trainers.trajectory import SplitObservations
from mlagents_envs.base_env import DecisionSteps
from mlagents_envs.timers import timed
from mlagents.trainers.policy.torch_policy import TorchPolicy

        self.optimizer = torch.optim.Adam(
            params, lr=self.trainer_params["learning_rate"]
        )
        reward_signal_configs = trainer_params["reward_signals"]
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",

        self.create_reward_signals(reward_signal_configs)

    def ppo_value_loss(self, values, old_values, returns):
        """

        }
        return update_stats
    def get_value_estimates(
        self, decision_requests: DecisionSteps, idx: int, done: bool
    ) -> Dict[str, float]:
        """
        Generates value estimates for bootstrapping.
        :param decision_requests:
        :param idx: Index in BrainInfo of agent.
        :param done: Whether or not this is the last element of the episode,
        in which case the value estimate will be 0.
        :return: The value estimate dictionary with key being the name of the reward signal
        and the value the corresponding value estimate.
        """
        vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
        value_estimates = self.policy.critic(
            np.expand_dims(vec_vis_obs.vector_observations[idx], 0),
            np.expand_dims(vec_vis_obs.visual_observations[idx], 0),
        )
        value_estimates = {k: float(v) for k, v in value_estimates.items()}
        # If we're done, reassign all of the value estimates that need terminal states.
        if done:
            for k in value_estimates:
                if self.reward_signals[k].use_terminal_states:
                    value_estimates[k] = 0.0
        return value_estimates

    def get_trajectory_value_estimates(
        self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        return ({"": np.zeros(0)}), {"": 0.0}
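ppo_value_loss is declared earlier in this file but its body is not included in the excerpt, and get_trajectory_value_estimates is reduced to the stub above. For reference, a clipped value loss in the usual PPO style looks roughly like the sketch below; this is an illustration under that assumption, not the code from this commit, and epsilon stands for the clipping range:

import torch


def clipped_value_loss(values, old_values, returns, epsilon):
    # Keep the new value prediction within +/- epsilon of the old one and take the
    # worse (larger) of the clipped and unclipped squared errors.
    clipped_values = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
    unclipped_loss = (returns - values) ** 2
    clipped_loss = (returns - clipped_values) ** 2
    return 0.5 * torch.mean(torch.max(unclipped_loss, clipped_loss))

Taking the elementwise maximum of the clipped and unclipped errors penalizes value updates that move too far from the previous prediction, mirroring the clipped policy objective.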