import logging

import numpy as np

from typing import Any, Dict

import torch
import torch.nn as nn

from mlagents.envs.timers import timed

from mlagents.envs.brain import BrainInfo, BrainParameters


logger = logging.getLogger("mlagents.trainers")

class VectorEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, **kwargs):
        super(VectorEncoder, self).__init__(**kwargs)
        # ModuleList (rather than a plain list) so the layers register as parameters.
        self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size)])
        for i in range(num_layers - 1):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
            self.layers.append(nn.ReLU())

    def forward(self, inputs):
        x = inputs
        for layer in self.layers:
            x = layer(x)
        return x

class Critic(nn.Module):
    def __init__(self, stream_names, hidden_size, encoder, **kwargs):
        super(Critic, self).__init__(**kwargs)
        self.stream_names = stream_names
        self.encoder = encoder
        # One value head per reward-signal stream; ModuleDict registers their parameters.
        self.value_heads = nn.ModuleDict()
        for name in stream_names:
            self.value_heads[name] = nn.Linear(hidden_size, 1)

    def forward(self, inputs):
        hidden = self.encoder(inputs)
        value_outputs = {}
        for stream_name, value in self.value_heads.items():
            value_outputs[stream_name] = value(hidden)
        return value_outputs

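# Illustrative usage sketch (not part of the trainer): the critic above returns one
# value estimate per reward-signal stream. The sizes and stream names below are
# made-up assumptions for demonstration only.
def _example_critic_usage():
    encoder = VectorEncoder(input_size=8, hidden_size=32, num_layers=2)
    critic = Critic(["extrinsic", "curiosity"], 32, encoder)
    obs = torch.ones(4, 8)  # batch of 4 observations, 8 features each
    value_outputs = critic(obs)  # {"extrinsic": (4, 1) tensor, "curiosity": (4, 1) tensor}
    return {name: v.shape for name, v in value_outputs.items()}
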
class GaussianDistribution(nn.Module):
    def __init__(self, hidden_size, num_outputs, **kwargs):
        super(GaussianDistribution, self).__init__(**kwargs)
        self.mu = nn.Linear(hidden_size, num_outputs)
        self.log_sigma_sq = nn.Linear(hidden_size, num_outputs)
        # Small-scale initialization keeps the initial policy mean close to zero.
        nn.init.xavier_uniform_(self.mu.weight, gain=0.01)
        nn.init.xavier_uniform_(self.log_sigma_sq.weight, gain=0.01)

    def forward(self, inputs):
        mu = self.mu(inputs)
        log_sig = self.log_sigma_sq(inputs)
        return torch.distributions.normal.Normal(
            loc=mu, scale=torch.sqrt(torch.exp(log_sig))
        )

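# Illustrative usage sketch (assumed sizes, not part of the trainer): the Gaussian head
# turns the encoder output into a torch Normal distribution, from which the policy
# samples actions and evaluates per-dimension log-probabilities and entropy.
def _example_gaussian_head_usage():
    head = GaussianDistribution(hidden_size=32, num_outputs=2)
    hidden = torch.zeros(4, 32)  # stand-in for an encoder output for a batch of 4
    dist = head(hidden)
    action = dist.sample()            # shape (4, 2)
    log_prob = dist.log_prob(action)  # shape (4, 2)
    entropy = dist.entropy()          # shape (4, 2)
    return action, log_prob, entropy
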
class Normalizer(nn.Module):
    def __init__(self, vec_obs_size, **kwargs):
        super(Normalizer, self).__init__(**kwargs)
        # Running statistics for online observation normalization.
        self.normalization_steps = torch.tensor(1)
        self.running_mean = torch.zeros(vec_obs_size)
        self.running_variance = torch.ones(vec_obs_size)

    def forward(self, inputs):
        inputs = torch.from_numpy(inputs)
        normalized_state = torch.clamp(
            (inputs - self.running_mean)
            / torch.sqrt(
                self.running_variance / self.normalization_steps.type(torch.float32)
            ),
            -5,
            5,
        )
        return normalized_state

    def update(self, vector_input):
        # Incremental update of the running mean and variance from the batch mean.
        vector_input = torch.from_numpy(vector_input)
        mean_current_observation = vector_input.mean(0).type(torch.float32)
        new_mean = self.running_mean + (
            mean_current_observation - self.running_mean
        ) / (self.normalization_steps + 1).type(torch.float32)
        new_variance = self.running_variance + (
            mean_current_observation - new_mean
        ) * (mean_current_observation - self.running_mean)
        self.running_mean = new_mean
        self.running_variance = new_variance
        self.normalization_steps = self.normalization_steps + 1

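# Illustrative usage sketch (made-up data): forward() normalizes a NumPy batch with the
# current running statistics (clipped to [-5, 5]), and update() folds the batch mean
# into those statistics.
def _example_normalizer_usage():
    normalizer = Normalizer(vec_obs_size=3)
    batch = np.array([[1.0, 2.0, 3.0], [2.0, 4.0, 6.0]])
    normalized = normalizer(batch)  # torch tensor of shape (2, 3)
    normalizer.update(batch)        # running mean moves toward the batch mean
    return normalized, normalizer.running_mean
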
class ActorCriticPolicy(nn.Module):
    def __init__(
        self,
        h_size,
        input_size,
        act_size,
        normalize,
        num_layers,
        stream_names,
    ):
        super(ActorCriticPolicy, self).__init__()
        self.encoder = VectorEncoder(input_size, h_size, num_layers)
        self.distribution = GaussianDistribution(h_size, act_size)
        self.critic = Critic(
            stream_names, h_size, VectorEncoder(input_size, h_size, num_layers)
        )
        self.normalize = normalize
        self.normalizer = Normalizer(input_size)

    def forward(self, inputs):
        if self.normalize:
            inputs = self.normalizer(inputs)
        _hidden = self.encoder(inputs)
        dist = self.distribution(_hidden)
        # entropy = dist.entropy()
        return dist

    def update_normalization(self, inputs):
        if self.normalize:
            self.normalizer.update(inputs)

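# The trainer code below also calls self.model.get_values(obs), which is not shown in
# this fragment. A minimal sketch of what such a method needs to do, assuming it simply
# normalizes the observations and queries the critic (the actual implementation may
# differ), is given as a standalone helper:
def _get_values_sketch(policy, observations):
    # Hypothetical stand-in for ActorCriticPolicy.get_values().
    if policy.normalize:
        obs = policy.normalizer(observations)
    else:
        obs = torch.from_numpy(np.array(observations))
    return policy.critic(obs)
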
:param load: Whether a pre-trained model will be loaded or a new one created.
"""
# super().__init__(seed, brain, trainer_params)
self.inference_dict: Dict[str, torch.Tensor] = {}
self.update_dict: Dict[str, torch.Tensor] = {}
# Observations arrive as 64-bit NumPy arrays, so default to double tensors to avoid
# dtype mismatches when converting with torch.from_numpy.
torch.set_default_tensor_type(torch.DoubleTensor)
self.stats_name_to_update_name = {
    "Losses/Value Loss": "value_loss",
    "Losses/Policy Loss": "policy_loss",
}
self.brain = brain
self.trainer_params = trainer_params
self.optimizer = torch.optim.Adam(
    self.model.parameters(), lr=self.trainer_params["learning_rate"]
)
self.sequence_length = (
    1 if not self.use_recurrent else self.trainer_params["sequence_length"]
)
self.global_step = torch.tensor(0)
self.create_reward_signals(reward_signal_configs)

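# For reference, the hyperparameters this policy reads from trainer_params in this file
# are collected below. The values are illustrative assumptions, not prescribed defaults.
_EXAMPLE_TRAINER_PARAMS = {
    "learning_rate": 3.0e-4,  # Adam step size (used in the constructor above)
    "hidden_units": 128,      # h_size passed to ActorCriticPolicy
    "num_layers": 2,          # depth of the vector encoder
    "normalize": True,        # whether observations go through the Normalizer
    "beta": 5.0e-3,           # entropy bonus coefficient (used in update() below)
}
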
def create_model(
    """
    self.model = ActorCriticPolicy(
        h_size=int(trainer_params["hidden_units"]),
        input_size=brain.vector_observation_space_size,
        act_size=sum(brain.vector_action_space_size),
        normalize=trainer_params["normalize"],
        num_layers=int(trainer_params["num_layers"]),

value_losses = []
for name, head in values.items():
    old_val_tensor = torch.DoubleTensor(old_values[name])
    clipped_value_estimate = old_val_tensor + torch.clamp(
        torch.sum(head, dim=1) - old_val_tensor, -decay_epsilon, decay_epsilon
    )
    v_opt_a = (torch.DoubleTensor(returns[name]) - torch.sum(head, dim=1)) ** 2
    v_opt_b = (torch.DoubleTensor(returns[name]) - clipped_value_estimate) ** 2
    value_loss = torch.mean(torch.max(v_opt_a, v_opt_b))
    value_losses.append(value_loss)
value_loss = torch.mean(torch.stack(value_losses))
return value_loss

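# Worked toy example (standalone, made-up numbers) of the clipped value loss above:
# the new value estimate is only trusted within +/- epsilon of the old estimate, and
# the worse (larger) of the clipped and unclipped squared errors is averaged.
def _example_clipped_value_loss():
    head = torch.tensor([[1.5], [0.5]], dtype=torch.float64)  # new value estimates
    old_values = np.array([1.0, 1.0])
    returns = np.array([2.0, 0.0])
    epsilon = 0.2
    old_val_tensor = torch.DoubleTensor(old_values)
    clipped = old_val_tensor + torch.clamp(
        torch.sum(head, dim=1) - old_val_tensor, -epsilon, epsilon
    )
    v_opt_a = (torch.DoubleTensor(returns) - torch.sum(head, dim=1)) ** 2
    v_opt_b = (torch.DoubleTensor(returns) - clipped) ** 2
    return torch.mean(torch.max(v_opt_a, v_opt_b))  # tensor(0.64)
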
def ppo_policy_loss(self, advantages, probs, old_probs, masks, epsilon):
    """
    Computes the clipped-surrogate PPO policy loss.
    :param advantages: Advantage estimates for the sampled actions.
    :param probs: Log probabilities of the actions under the current policy.
    :param old_probs: Log probabilities of the same actions under the old policy.
    :param masks: Mask of active (non-padded) timesteps.
    :param epsilon: Clipping range for the probability ratio.
    """
    advantage = torch.from_numpy(np.expand_dims(advantages, -1))
    r_theta = torch.exp(probs - torch.DoubleTensor(old_probs))
    p_opt_a = r_theta * advantage
    p_opt_b = (
        torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
    )
    policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b))
    # For cleaner stats reporting
    # abs_policy_loss = torch.abs(policy_loss)
    return policy_loss

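# Worked toy example (standalone, made-up numbers) of the clipped surrogate objective
# above: the probability ratio r_theta is clamped to [1 - eps, 1 + eps] and the
# pessimistic (elementwise minimum) term is averaged and negated for minimization.
def _example_clipped_policy_loss():
    advantages = np.array([1.0, -1.0])
    new_log_probs = torch.tensor([[-0.5], [-0.5]], dtype=torch.float64)
    old_log_probs = np.array([[-1.0], [-0.2]])
    epsilon = 0.2
    advantage = torch.from_numpy(np.expand_dims(advantages, -1))
    r_theta = torch.exp(new_log_probs - torch.DoubleTensor(old_log_probs))
    p_opt_a = r_theta * advantage
    p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return -torch.mean(torch.min(p_opt_a, p_opt_b))
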
)
self.update_dict.update(self.reward_signals[reward_signal].update_dict)

def execute_model(self, observations):
    action_dist = self.model(observations)
    action = action_dist.sample()
    log_probs = action_dist.log_prob(action)
    entropy = action_dist.entropy()
    value_heads = self.model.get_values(observations)
    # Return order matches the unpacking in evaluate() below.
    return action, log_probs, entropy, value_heads

action, log_probs, entropy, value_heads = self.execute_model(
    brain_info.vector_observations
)
# Detach before converting, since the tensors still carry gradient history.
run_out["action"] = np.array(action.detach())
run_out["log_probs"] = np.array(log_probs.detach())
run_out["entropy"] = np.array(entropy.detach())
run_out["value_heads"] = {
    name: np.array(t.detach()) for name, t in value_heads.items()
}
run_out["value"] = np.mean(list(run_out["value_heads"].values()), 0)
run_out["learning_rate"] = 0.0
self.model.update_normalization(brain_info.vector_observations)

:param num_sequences: Number of sequences to process.
:return: Results of update.
"""
returns = {}
old_values = {}
for name in self.reward_signals:
    returns[name] = mini_batch["{}_returns".format(name)]
    old_values[name] = mini_batch["{}_value_estimates".format(name)]

obs = np.array(mini_batch["vector_obs"])
values = self.model.get_values(obs)
dist = self.model(obs)
probs = dist.log_prob(torch.from_numpy(np.array(mini_batch["actions"])))
entropy = dist.entropy()
value_loss = self.ppo_value_loss(values, old_values, returns)
policy_loss = self.ppo_policy_loss(
    np.array(mini_batch["advantages"]),
    probs,
    np.array(mini_batch["action_probs"]),
    np.array(mini_batch["masks"], dtype=np.uint32),
    1e-3,
)
loss = (
    policy_loss
    + 0.5 * value_loss
    - self.trainer_params["beta"] * torch.mean(entropy)
)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()

update_stats["Losses/Policy Loss"] = abs(policy_loss.detach().numpy())
update_stats["Losses/Value Loss"] = value_loss.detach().numpy()
# for stat_name, update_name in stats_needed.items():
#     update_stats[stat_name] = update_vals[update_name]
return update_stats

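# For reference, a hypothetical mini-batch with the keys update() reads above (one
# returns / value-estimates pair per reward signal). Shapes and names here are
# illustrative assumptions, not a specification of the buffer format.
def _example_mini_batch(batch_size=2, obs_size=8, act_size=2, reward_signals=("extrinsic",)):
    mini_batch = {
        "vector_obs": np.zeros((batch_size, obs_size)),
        "actions": np.zeros((batch_size, act_size)),
        "action_probs": np.zeros((batch_size, act_size)),
        "advantages": np.zeros(batch_size),
        "masks": np.ones(batch_size),
    }
    for name in reward_signals:
        mini_batch["{}_returns".format(name)] = np.zeros(batch_size)
        mini_batch["{}_value_estimates".format(name)] = np.zeros(batch_size)
    return mini_batch
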
Gets current model step.
:return: current model step.
"""
step = self.global_step.detach().numpy()
return step


def increment_step(self, n_steps):
    self.global_step = self.global_step + n_steps
    return self.get_current_step()