
Combine actor and critic classes. Initial export.

/develop/add-fire
Arthur Juliani, 4 years ago
Current commit: 1736559f
7 files changed, with 104 insertions and 47 deletions
  1. ml-agents/mlagents/trainers/models_torch.py (40 changes)
  2. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (14 changes)
  3. ml-agents/mlagents/trainers/policy/policy.py (12 changes)
  4. ml-agents/mlagents/trainers/policy/tf_policy.py (8 changes)
  5. ml-agents/mlagents/trainers/policy/torch_policy.py (59 changes)
  6. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (11 changes)
  7. ml-agents/mlagents/trainers/trainer/trainer.py (7 changes)

ml-agents/mlagents/trainers/models_torch.py (40 changes)


if self.use_lstm:
embedding = embedding.reshape([sequence_length, -1, self.h_size])
print(embedding.shape, memories.shape)
class Actor(nn.Module):
class ActorCritic(nn.Module):
def __init__(
self,
h_size,

vis_encode_type,
act_type,
use_lstm,
stream_names,
separate_critic,
super(Actor, self).__init__()
super(ActorCritic, self).__init__()
self.separate_critic = separate_critic
self.network_body = NetworkBody(
vector_sizes,
visual_sizes,

self.distribution = GaussianDistribution(h_size, act_size[0])
else:
self.distribution = MultiCategoricalDistribution(h_size, act_size)
if separate_critic:
self.critic = Critic(
stream_names,
h_size,
vector_sizes,
visual_sizes,
normalize,
num_layers,
m_size,
vis_encode_type,
)
else:
self.stream_names = stream_names
self.value_heads = ValueHeads(stream_names, h_size)
def update_normalization(self, vector_obs):
self.network_body.update_normalization(vector_obs)
if self.separate_critic:
self.critic.network_body.update_normalization(vector_obs)
def critic_pass(self, vec_inputs, vis_inputs):
if self.separate_critic:
return self.critic(vec_inputs, vis_inputs)
else:
embedding, _ = self.network_body(vec_inputs, vis_inputs)
return self.value_heads(embedding)
def forward(
self, vec_inputs, vis_inputs, masks=None, memories=None, sequence_length=1

dist = self.distribution(embedding)
else:
dist = self.distribution(embedding, masks=masks)
return dist, memories
if self.separate_critic:
value_outputs = self.critic(vec_inputs, vis_inputs)
else:
value_outputs = self.value_heads(embedding)
return dist, value_outputs, memories
class Critic(nn.Module):
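The change above folds the old Actor and Critic into one ActorCritic module: forward now returns the action distribution together with value outputs, and a separate_critic flag keeps a standalone critic network for the continuous-action case. A minimal sketch of that shape, using plain nn.Linear stand-ins rather than the real NetworkBody / ValueHeads / distribution classes:

import torch
from torch import nn


class ActorCritic(nn.Module):
    """Simplified stand-in for the combined actor/critic in models_torch.py."""

    def __init__(self, obs_size, act_size, h_size=128, separate_critic=False):
        super().__init__()
        self.separate_critic = separate_critic
        # Shared trunk used by the actor (and by the value head when not separate).
        self.network_body = nn.Sequential(nn.Linear(obs_size, h_size), nn.ReLU())
        # Stand-in for GaussianDistribution / MultiCategoricalDistribution.
        self.distribution = nn.Linear(h_size, act_size)
        if separate_critic:
            # Continuous-action case in the diff: an entirely separate critic network.
            self.critic = nn.Sequential(
                nn.Linear(obs_size, h_size), nn.ReLU(), nn.Linear(h_size, 1)
            )
        else:
            # Discrete case: the value head hangs off the shared trunk.
            self.value_head = nn.Linear(h_size, 1)

    def critic_pass(self, obs):
        # Value estimate only, mirroring ActorCritic.critic_pass above.
        if self.separate_critic:
            return self.critic(obs)
        return self.value_head(self.network_body(obs))

    def forward(self, obs):
        # One pass returning both the policy output and the value estimate.
        embedding = self.network_body(obs)
        dist = self.distribution(embedding)
        value = self.critic(obs) if self.separate_critic else self.value_head(embedding)
        return dist, value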

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (14 changes)


"""
vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
value_estimates, mean_value = self.policy.critic(
value_estimates, mean_value = self.policy.actor_critic.critic_pass(
np.expand_dims(vec_vis_obs.vector_observations[idx], 0),
np.expand_dims(vec_vis_obs.visual_observations[idx], 0),
)

vector_obs = [torch.Tensor(np.array(batch["vector_obs"]))]
if self.policy.use_vis_obs:
visual_obs = []
for idx, _ in enumerate(self.policy.actor.network_body.visual_encoders):
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_encoders
):
visual_ob = torch.Tensor(np.array(batch["visual_obs%d" % idx]))
visual_obs.append(visual_ob)
else:

next_obs = [torch.Tensor(next_obs).unsqueeze(0)]
value_estimates, mean_value = self.policy.critic(vector_obs, visual_obs)
value_estimates, mean_value = self.policy.actor_critic.critic_pass(
vector_obs, visual_obs
)
next_value_estimate, next_value = self.policy.critic(next_obs, next_obs)
next_value_estimate, next_value = self.policy.actor_critic.critic_pass(
next_obs, next_obs
)
for name, estimate in value_estimates.items():
value_estimates[name] = estimate.detach().numpy()
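With the classes combined, the optimizer no longer reaches into policy.critic; both batch and bootstrap value estimates come from policy.actor_critic.critic_pass. A rough sketch of the new call path, assuming critic_pass returns a dict of per-reward-stream tensors; the get_trajectory_value_estimates name and the surrounding plumbing are illustrative, not the actual ml-agents code:

import numpy as np
import torch


def get_trajectory_value_estimates(policy, batch, next_obs):
    # Hypothetical helper; only the actor_critic.critic_pass call path is from the diff.
    vector_obs = [torch.Tensor(np.array(batch["vector_obs"]))]
    visual_obs = []  # would be filled per visual encoder when the policy uses visual obs

    # old: policy.critic(...)  ->  new: policy.actor_critic.critic_pass(...)
    value_estimates, _mean_value = policy.actor_critic.critic_pass(vector_obs, visual_obs)
    next_value_estimate, _ = policy.actor_critic.critic_pass(
        [torch.Tensor(next_obs).unsqueeze(0)], visual_obs
    )

    # Detach per-stream tensors before handing them to the numpy-based return/GAE code.
    value_estimates = {name: est.detach().numpy() for name, est in value_estimates.items()}
    return value_estimates, next_value_estimate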

ml-agents/mlagents/trainers/policy/policy.py (12 changes)


raise NotImplementedError
@abstractmethod
def increment_step(self, n_steps):
def export_model(self, step=0):
def save_model(self, step):
def save_model(self, step=0):
pass
@abstractmethod
def load_model(self, step=0):
pass
@abstractmethod
def increment_step(self, n_steps):
pass
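The abstract base class now declares save_model, load_model and increment_step (and export_model likewise takes an optional step), so trainers can drive either backend through one interface. Condensed, the contract looks roughly like this, with docstrings and other members omitted:

import abc


class Policy(abc.ABC):
    # Signatures follow the diff; whether every method is abstract is not fully
    # visible above, so treat this as an approximation of the interface.
    @abc.abstractmethod
    def save_model(self, step=0):
        pass

    @abc.abstractmethod
    def load_model(self, step=0):
        pass

    @abc.abstractmethod
    def export_model(self, step=0):
        pass

    @abc.abstractmethod
    def increment_step(self, n_steps):
        pass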

ml-agents/mlagents/trainers/policy/tf_policy.py (8 changes)


from typing import Any, Dict, List, Optional
import abc
import numpy as np
from mlagents.model_serialization import SerializationSettings, export_policy_model
from mlagents.tf_utils import tf
from mlagents import tf_utils
from mlagents_envs.logging_util import get_logger

"""
raise UnityPolicyException("The evaluate function was not implemented.")
def export_model(self, step=0):
settings = SerializationSettings(self.model_path, self.brain.brain_name)
export_policy_model(settings, self.graph, self.sess)
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:

"""
return list(self.update_dict.keys())
def save_model(self, step):
def save_model(self, step=0):
"""
Saves the model
:param step: The number of steps the model was trained for
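On the TensorFlow side the change is just aligning signatures with the base class: save_model and export_model now default step to 0, while export itself still goes through SerializationSettings and export_policy_model. The point of the shared defaults is that backend-agnostic code can call both policies the same way; the helper below is illustrative:

def checkpoint_and_export(policy, step=0):
    # Works for TFPolicy (frozen graph via export_policy_model) and for
    # TorchPolicy (state_dict checkpoint + ONNX export).
    policy.save_model(step)
    policy.export_model(step)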

ml-agents/mlagents/trainers/policy/torch_policy.py (59 changes)


from typing import Any, Dict, List
import numpy as np
import torch
from torch import onnx
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.brain_conversion_utils import get_global_agent_id

from mlagents.trainers.policy.policy import UnityPolicyException
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.models_torch import EncoderType, Actor, Critic
from mlagents.trainers.models_torch import EncoderType, ActorCritic
EPSILON = 1e-7 # Small value to avoid divide by zero

"Losses/Policy Loss": "policy_loss",
}
self.actor = Actor(
self.actor_critic = ActorCritic(
h_size=int(trainer_params["hidden_units"]),
act_type=self.act_type,
vector_sizes=[brain.vector_observation_space_size],

vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
)
self.critic = Critic(
h_size=int(trainer_params["hidden_units"]),
vector_sizes=[brain.vector_observation_space_size],
normalize=trainer_params["normalize"],
num_layers=int(trainer_params["num_layers"]),
m_size=trainer_params["memory_size"],
use_lstm=self.use_recurrent,
visual_sizes=brain.camera_resolutions,
vis_encode_type=EncoderType(
trainer_params.get("vis_encode_type", "simple")
),
separate_critic=self.use_continuous_act,
)
def split_decision_step(self, decision_requests):

vector_obs = torch.Tensor(vector_obs)
vector_obs = [vector_obs]
if self.use_vec_obs and self.normalize:
self.critic.network_body.update_normalization(vector_obs)
self.actor.network_body.update_normalization(vector_obs)
self.actor_critic.update_normalization(vector_obs)
action_dists, new_memories = self.actor(
action_dists, (value_heads, mean_value), new_memories = self.actor_critic(
vec_obs, vis_obs, masks, memories, seq_len
)
if actions is None:

actions = actions.squeeze(-1)
log_probs = log_probs.squeeze(-1)
entropies = entropies.squeeze(-1)
value_heads, mean_value = self.critic(vec_obs, vis_obs)
return actions, log_probs, entropies, value_heads, memories
@timed

run_out["learning_rate"] = 0.0
if self.use_recurrent:
run_out["memories"] = np.array(memories.detach())
self.actor.network_body.update_normalization(vec_obs)
self.critic.network_body.update_normalization(vec_obs)
self.actor_critic.update_normalization(vec_obs)
return run_out
def get_action(

agent_ids=list(decision_requests.agent_id),
)
def save_model(self, step=0):
"""
Saves the model
:param step: The number of steps the model was trained for
"""
save_path = self.model_path + "/model-" + str(step) + ".pt"
torch.save(self.actor_critic.state_dict(), save_path)
def load_model(self, step=0):
load_path = self.model_path + "/model-" + str(step) + ".pt"
self.actor_critic.load_state_dict(torch.load(load_path))
def export_model(self, step=0):
fake_vec_obs = [torch.zeros(self.vec_obs_size)]
fake_vis_obs = [torch.zeros(camera_res) for camera_res in self.vis_obs_size]
export_path = self.model_path + "/model-" + str(step) + ".onnx"
output_names = ["action", "memories", "value_estimates"]
onnx.export(
self.actor_critic,
(fake_vec_obs, fake_vis_obs),
export_path,
verbose=True,
output_names=output_names,
)
@property
def vis_obs_size(self):
return self.brain.number_visual_observations

"""
self.global_step += n_steps
return self.get_current_step()
def save_model(self, step):
pass
def export_model(self):
pass
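TorchPolicy checkpointing is a plain state_dict round trip, and export_model traces the whole ActorCritic to ONNX with dummy observations. A standalone sketch of the same pattern; the paths, shapes, and single-vector-observation assumption are illustrative:

import torch


def save_model(model, model_path, step=0):
    # state_dict checkpoint, as in TorchPolicy.save_model above.
    torch.save(model.state_dict(), f"{model_path}/model-{step}.pt")


def load_model(model, model_path, step=0):
    model.load_state_dict(torch.load(f"{model_path}/model-{step}.pt"))


def export_model(model, model_path, vec_obs_size, step=0):
    # Dummy inputs drive the ONNX trace; the real code also builds one zero
    # tensor per visual sensor. Output names are taken from the diff.
    fake_vec_obs = torch.zeros(1, vec_obs_size)
    torch.onnx.export(
        model,
        (fake_vec_obs,),
        f"{model_path}/model-{step}.onnx",
        output_names=["action", "memories", "value_estimates"],
    )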

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (11 changes)


# Create the graph here to give more granular control of the TF graph to the Optimizer.
super(PPOOptimizer, self).__init__(policy, trainer_params)
params = list(self.policy.actor.parameters()) + list(
self.policy.critic.parameters()
)
params = list(self.policy.actor_critic.parameters())
self.optimizer = torch.optim.Adam(
params, lr=self.trainer_params["learning_rate"]

torch.Tensor(np.array(batch["memory"][i]))
for i in range(0, len(batch["memory"]), self.policy.sequence_length)
]
memories = torch.stack(memories).unsqueeze(0)
if len(memories) > 0:
memories = torch.stack(memories).unsqueeze(0)
for idx, _ in enumerate(self.policy.actor.network_body.visual_encoders):
for idx, _ in enumerate(
self.policy.actor_critic.network_body.visual_encoders
):
vis_ob = torch.Tensor(np.array(batch["visual_obs%d" % idx]))
vis_obs.append(vis_ob)
else:
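Because actor and critic now live in one nn.Module, the PPO optimizer builds a single parameter list instead of concatenating two. For example, with a placeholder module and learning rate:

import torch
from torch import nn

actor_critic = nn.Linear(8, 2)  # placeholder standing in for policy.actor_critic

# old: list(policy.actor.parameters()) + list(policy.critic.parameters())
# new: one combined module, one parameter list
params = list(actor_critic.parameters())
optimizer = torch.optim.Adam(params, lr=3.0e-4)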

ml-agents/mlagents/trainers/trainer/trainer.py (7 changes)


from collections import deque
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.stats import StatsReporter
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.agent_processor import AgentManagerQueue

"""
Exports the model
"""
print("Export")
# policy = self.get_policy(name_behavior_id)
# settings = SerializationSettings(policy.model_path, policy.brain.brain_name)
# export_policy_model(settings, policy.graph, policy.sess)
policy = self.get_policy(name_behavior_id)
policy.export_model()
@abc.abstractmethod
def end_episode(self):
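In the trainer, export stops calling the TF serialization helpers directly and delegates to the policy, so the same hook can produce either a frozen TF graph or an ONNX file. A minimal stand-in class showing the delegation; only the get_policy / export_model call shape follows the diff:

class ExampleTrainer:
    def __init__(self, policies):
        self.policies = policies  # behavior id -> policy object

    def get_policy(self, name_behavior_id):
        return self.policies[name_behavior_id]

    def export_model(self, name_behavior_id):
        # old: SerializationSettings + export_policy_model on the TF graph/session
        # new: let the policy export itself, whatever its backend
        self.get_policy(name_behavior_id).export_model()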
