
Replace torch.detach().cpu().numpy() with a utils method (#4406)

* Replace torch.detach().cpu().numpy() with a utils method

* Using item() in place of to_numpy()

* more use of item() and additional tests
Branch: MLA-1734-demo-provider
Committed by GitHub, 4 years ago
Current commit: 498934f9
10 files changed, with 48 insertions and 50 deletions
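The refactor applies one pattern throughout: tensors that need to become numpy arrays go through the new ModelUtils.to_numpy helper, and single-element loss tensors are converted with torch's built-in .item(). A minimal sketch of the before/after shape of the change (the variable names below are illustrative, not taken from the diff):

import torch
from mlagents.trainers.torch.utils import ModelUtils

value = torch.rand(3, 2)   # multi-element tensor: convert to a numpy array
loss = torch.tensor(0.25)  # single-element tensor: convert to a Python float

# Before: the conversion chain was repeated at every call site
value_np = value.detach().cpu().numpy()
loss_f = float(loss.detach().cpu().numpy())

# After: shared helper for arrays, item() for scalars
value_np = ModelUtils.to_numpy(value)
loss_f = loss.item()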
  1. ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (4 changes)
  2. ml-agents/mlagents/trainers/policy/torch_policy.py (12 changes)
  3. ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)
  4. ml-agents/mlagents/trainers/sac/optimizer_torch.py (13 changes)
  5. ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py (21 changes)
  6. ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (9 changes)
  7. ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)
  8. ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (6 changes)
  9. ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (19 changes)
  10. ml-agents/mlagents/trainers/torch/utils.py (8 changes)

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py (4 changes)


)
for name, estimate in value_estimates.items():
- value_estimates[name] = estimate.detach().cpu().numpy()
- next_value_estimate[name] = next_value_estimate[name].detach().cpu().numpy()
+ value_estimates[name] = ModelUtils.to_numpy(estimate)
+ next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])
if done:
for k in next_value_estimate:

ml-agents/mlagents/trainers/policy/torch_policy.py (12 changes)


action, log_probs, entropy, value_heads, memories = self.sample_actions(
vec_obs, vis_obs, masks=masks, memories=memories
)
run_out["action"] = action.detach().cpu().numpy()
run_out["pre_action"] = action.detach().cpu().numpy()
run_out["action"] = ModelUtils.to_numpy(action)
run_out["pre_action"] = ModelUtils.to_numpy(action)
run_out["log_probs"] = log_probs.detach().cpu().numpy()
run_out["entropy"] = entropy.detach().cpu().numpy()
run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
run_out["entropy"] = ModelUtils.to_numpy(entropy)
name: t.detach().cpu().numpy() for name, t in value_heads.items()
name: ModelUtils.to_numpy(t) for name, t in value_heads.items()
run_out["memory_out"] = memories.detach().cpu().numpy().squeeze(0)
run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)
return run_out
def get_action(

ml-agents/mlagents/trainers/ppo/optimizer_torch.py (4 changes)


self.optimizer.step()
update_stats = {
"Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()),
"Losses/Value Loss": value_loss.detach().cpu().numpy(),
"Losses/Policy Loss": policy_loss.item(),
"Losses/Value Loss": value_loss.item(),
"Policy/Learning Rate": decay_lr,
"Policy/Epsilon": decay_eps,
"Policy/Beta": decay_bet,

ml-agents/mlagents/trainers/sac/optimizer_torch.py (13 changes)


# Update target network
self.soft_update(self.policy.actor_critic.critic, self.target_network, self.tau)
update_stats = {
"Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()),
"Losses/Value Loss": value_loss.detach().cpu().numpy(),
"Losses/Q1 Loss": q1_loss.detach().cpu().numpy(),
"Losses/Q2 Loss": q2_loss.detach().cpu().numpy(),
"Policy/Entropy Coeff": torch.exp(self._log_ent_coef)
.detach()
.cpu()
.numpy(),
"Losses/Policy Loss": policy_loss.item(),
"Losses/Value Loss": value_loss.item(),
"Losses/Q1 Loss": q1_loss.item(),
"Losses/Q2 Loss": q2_loss.item(),
"Policy/Entropy Coeff": torch.exp(self._log_ent_coef).item(),
"Policy/Learning Rate": decay_lr,
}
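A caveat on the .item() calls used for these stats: .item() only succeeds on tensors holding a single element, which is the case for the scalar losses and the exponentiated entropy coefficient above. A standalone illustration (names are hypothetical, not from this file):

import torch

scalar_loss = torch.tensor(1.5, requires_grad=True)
print(scalar_loss.item())  # 1.5 as a plain Python float; no explicit detach() needed

batched = torch.ones(4)
try:
    batched.item()  # fails: only one-element tensors convert to Python scalars
except (RuntimeError, ValueError) as err:
    print(err)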

ml-agents/mlagents/trainers/tests/torch/test_bcmodule.py (21 changes)


import pytest
import mlagents.trainers.tests.mock_brain as mb
import numpy as np
import os
from mlagents.trainers.policy.torch_policy import TorchPolicy

default_num_epoch=3,
)
return bc_module
+ def assert_stats_are_float(stats):
+     for _, item in stats.items():
+         assert isinstance(item, float)
# Test default values

)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
- for _, item in stats.items():
-     assert isinstance(item, np.float32)
+ assert_stats_are_float(stats)
# Test with constant pretraining learning rate

)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
- for _, item in stats.items():
-     assert isinstance(item, np.float32)
+ assert_stats_are_float(stats)
old_learning_rate = bc_module.current_lr
_ = bc_module.update()

)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
- for _, item in stats.items():
-     assert isinstance(item, np.float32)
+ assert_stats_are_float(stats)
# Test with discrete control and visual observations

)
bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
stats = bc_module.update()
- for _, item in stats.items():
-     assert isinstance(item, np.float32)
+ assert_stats_are_float(stats)
# Test with discrete control, visual observations and RNN

)
bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
stats = bc_module.update()
- for _, item in stats.items():
-     assert isinstance(item, np.float32)
+ assert_stats_are_float(stats)
if __name__ == "__main__":
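The assertions move from np.float32 to the built-in float because .item() returns a Python float, which the old numpy-based values were not; the new assert_stats_are_float helper centralizes that check. A standalone illustration of the distinction (not part of the test file):

import numpy as np
import torch

loss = torch.tensor(0.5)
print(isinstance(loss.item(), float))      # True: item() yields a built-in float
print(isinstance(np.float32(0.5), float))  # False: a numpy float32 is not a Python float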

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py (9 changes)


from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,
)
+ from mlagents.trainers.torch.utils import ModelUtils
SEED = [42]

buffer = create_agent_buffer(behavior_spec, 5)
for _ in range(200):
curiosity_rp.update(buffer)
- prediction = curiosity_rp._network.predict_action(buffer)[0].detach()
- target = buffer["actions"][0]
- error = float(torch.mean((prediction - target) ** 2))
+ prediction = curiosity_rp._network.predict_action(buffer)[0]
+ target = torch.tensor(buffer["actions"][0])
+ error = torch.mean((prediction - target) ** 2).item()
assert error < 0.001

curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_next_state(buffer)[0]
target = curiosity_rp._network.get_next_state(buffer)[0]
- error = float(torch.mean((prediction - target) ** 2).detach())
+ error = float(ModelUtils.to_numpy(torch.mean((prediction - target) ** 2)))
assert error < 0.001
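For a zero-dimensional tensor the two conversions used in this test agree: float(ModelUtils.to_numpy(x)) and x.item() produce the same Python float. A standalone check (the variable name is illustrative):

import torch
from mlagents.trainers.torch.utils import ModelUtils

x = torch.tensor(0.125)
assert x.item() == float(ModelUtils.to_numpy(x)) == 0.125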

ml-agents/mlagents/trainers/torch/components/bc/module.py (2 changes)


bc_loss.backward()
self.optimizer.step()
run_out = {"loss": bc_loss.detach().cpu().numpy()}
run_out = {"loss": bc_loss.item()}
return run_out

ml-agents/mlagents/trainers/torch/components/reward_providers/curiosity_reward_provider.py (6 changes)


def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
with torch.no_grad():
- rewards = self._network.compute_reward(mini_batch).detach().cpu().numpy()
+ rewards = ModelUtils.to_numpy(self._network.compute_reward(mini_batch))
rewards = np.minimum(rewards, 1.0 / self.strength)
return rewards * self._has_updated_once

loss.backward()
self.optimizer.step()
return {
"Losses/Curiosity Forward Loss": forward_loss.detach().cpu().numpy(),
"Losses/Curiosity Inverse Loss": inverse_loss.detach().cpu().numpy(),
"Losses/Curiosity Forward Loss": forward_loss.item(),
"Losses/Curiosity Inverse Loss": inverse_loss.item(),
}
def get_modules(self):

ml-agents/mlagents/trainers/torch/components/reward_providers/gail_reward_provider.py (19 changes)


estimates, _ = self._discriminator_network.compute_estimate(
mini_batch, use_vail_noise=False
)
- return (
+ return ModelUtils.to_numpy(
-     .detach()
-     .cpu()
-     .numpy()
)
def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]:

expert_estimate, expert_mu = self.compute_estimate(
expert_batch, use_vail_noise=True
)
stats_dict["Policy/GAIL Policy Estimate"] = (
policy_estimate.mean().detach().cpu().numpy()
)
stats_dict["Policy/GAIL Expert Estimate"] = (
expert_estimate.mean().detach().cpu().numpy()
)
stats_dict["Policy/GAIL Policy Estimate"] = policy_estimate.mean().item()
stats_dict["Policy/GAIL Expert Estimate"] = expert_estimate.mean().item()
stats_dict["Losses/GAIL Loss"] = discriminator_loss.detach().cpu().numpy()
stats_dict["Losses/GAIL Loss"] = discriminator_loss.item()
total_loss += discriminator_loss
if self._settings.use_vail:
# KL divergence loss (encourage latent representation to be normal)

torch.tensor(0.0),
)
total_loss += vail_loss
stats_dict["Policy/GAIL Beta"] = self._beta.detach().cpu().numpy()
stats_dict["Losses/GAIL KL Loss"] = kl_loss.detach().cpu().numpy()
stats_dict["Policy/GAIL Beta"] = self._beta.item()
stats_dict["Losses/GAIL KL Loss"] = kl_loss.item()
if self.gradient_penalty_weight > 0.0:
total_loss += (
self.gradient_penalty_weight

ml-agents/mlagents/trainers/torch/utils.py (8 changes)


return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype)
+ @staticmethod
+ def to_numpy(tensor: torch.Tensor) -> np.ndarray:
+     """
+     Converts a Torch Tensor to a numpy array. If the Tensor is on the GPU, it will
+     be brought to the CPU.
+     """
+     return tensor.detach().cpu().numpy()
@staticmethod
def break_into_branches(
concatenated_logits: torch.Tensor, action_size: List[int]
) -> List[torch.Tensor]:
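A brief usage sketch of the new helper; the CUDA check below is illustrative and assumes the helper is imported from the path shown above:

import numpy as np
import torch
from mlagents.trainers.torch.utils import ModelUtils

device = "cuda" if torch.cuda.is_available() else "cpu"
t = torch.rand(2, 3, requires_grad=True).to(device)

arr = ModelUtils.to_numpy(t)  # detaches, moves to CPU, and converts in one call
assert isinstance(arr, np.ndarray)
assert arr.shape == (2, 3)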
