
Merge branch 'develop-add-fire' into develop-add-fire-memoryclass

Ervin Teng, 4 years ago
Current commit: a04e68a4
7 changed files with 195 additions and 319 deletions
1. ml-agents/mlagents/trainers/learn.py (35 changes)
2. ml-agents/mlagents/trainers/policy/torch_policy.py (55 changes)
3. ml-agents/mlagents/trainers/ppo/trainer.py (9 changes)
4. ml-agents/mlagents/trainers/settings.py (7 changes)
5. ml-agents/mlagents/trainers/trainer/rl_trainer.py (10 changes)
6. ml-agents/mlagents/trainers/tests/torch/test_policy.py (150 changes)
7. experiment_torch.py (248 changes)

ml-agents/mlagents/trainers/learn.py (35 changes)


 )
 from mlagents.trainers.cli_utils import parser
 from mlagents_envs.environment import UnityEnvironment
-from mlagents.trainers.settings import RunOptions, TestingConfiguration
+from mlagents.trainers.settings import RunOptions
 from mlagents.trainers.training_status import GlobalTrainingStatus
 from mlagents_envs.base_env import BaseEnv

     add_metadata as add_timer_metadata,
 )
 from mlagents_envs import logging_util
-from mlagents_envs.registry import default_registry

 logger = logging_util.get_logger(__name__)

     ) -> UnityEnvironment:
         # Make sure that each environment gets a different seed
         env_seed = seed + worker_id
-        if TestingConfiguration.env_name == "":
-            return UnityEnvironment(
-                file_name=env_path,
-                worker_id=worker_id,
-                seed=env_seed,
-                no_graphics=no_graphics,
-                base_port=start_port,
-                additional_args=env_args,
-                side_channels=side_channels,
-                log_folder=log_folder,
-            )
-        else:
-            return default_registry[TestingConfiguration.env_name].make(
-                seed=env_seed,
-                no_graphics=no_graphics,
-                base_port=start_port,
-                worker_id=worker_id,
-                additional_args=env_args,
-                side_channels=side_channels,
-                log_folder=log_folder,
-            )
+        return UnityEnvironment(
+            file_name=env_path,
+            worker_id=worker_id,
+            seed=env_seed,
+            no_graphics=no_graphics,
+            base_port=start_port,
+            additional_args=env_args,
+            side_channels=side_channels,
+            log_folder=log_folder,
+        )

     return create_unity_environment
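
The simplified factory keeps only the per-worker seeding logic: the closure captures the run seed and offsets it by worker_id, so parallel environments stay reproducible but distinct. Below is a minimal, self-contained sketch of that closure pattern; StubEnv and create_environment_factory are illustrative stand-ins for UnityEnvironment and the surrounding learn.py code, not the mlagents API.

    from typing import Callable


    class StubEnv:
        # Hypothetical stand-in for UnityEnvironment; records its arguments.
        def __init__(self, file_name: str, worker_id: int, seed: int):
            self.file_name = file_name
            self.worker_id = worker_id
            self.seed = seed


    def create_environment_factory(env_path: str, seed: int) -> Callable[[int], StubEnv]:
        def create_unity_environment(worker_id: int) -> StubEnv:
            # Make sure that each environment gets a different seed
            env_seed = seed + worker_id
            return StubEnv(file_name=env_path, worker_id=worker_id, seed=env_seed)

        return create_unity_environment


    factory = create_environment_factory("3DBall", seed=42)
    assert factory(3).seed == 45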

ml-agents/mlagents/trainers/policy/torch_policy.py (55 changes)


-from typing import Any, Dict, List
+from typing import Any, Dict, List, Tuple, Optional
 import numpy as np
 import torch

 from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
 from mlagents_envs.timers import timed
-from mlagents.trainers.settings import TrainerSettings, TestingConfiguration
+from mlagents.trainers.settings import TrainerSettings
 from mlagents.trainers.trajectory import SplitObservations
 from mlagents.trainers.torch.networks import (
     SharedActorCritic,

         )  # could be much simpler if TorchPolicy is nn.Module
         self.grads = None
-        if TestingConfiguration.device != "cpu":
-            torch.set_default_tensor_type(torch.cuda.FloatTensor)
-        else:
-            torch.set_default_tensor_type(torch.FloatTensor)
+        torch.set_default_tensor_type(torch.FloatTensor)
         reward_signal_configs = trainer_settings.reward_signals
         reward_signal_names = [key.value for key, _ in reward_signal_configs.items()]

         # m_size needed for training is determined by network, not trainer settings
         self.m_size = self.actor_critic.memory_size
-        self.actor_critic.to(TestingConfiguration.device)
+        self.actor_critic.to("cpu")

     @property
     def export_memory_size(self) -> int:
         """
         return self._export_m_size

-    def split_decision_step(self, decision_requests):
+    def _split_decision_step(
+        self, decision_requests: DecisionSteps
+    ) -> Tuple[SplitObservations, np.ndarray]:
         vec_vis_obs = SplitObservations.from_observations(decision_requests.obs)
         mask = None
         if not self.use_continuous_act:

                 1 - np.concatenate(decision_requests.action_mask, axis=1)
             )
-        return vec_vis_obs.vector_observations, vec_vis_obs.visual_observations, mask
+        return vec_vis_obs, mask

     def update_normalization(self, vector_obs: np.ndarray) -> None:
         """

     @timed
     def sample_actions(
         self,
-        vec_obs,
-        vis_obs,
-        masks=None,
-        memories=None,
-        seq_len=1,
-        all_log_probs=False,
-    ):
+        vec_obs: List[torch.Tensor],
+        vis_obs: List[torch.Tensor],
+        masks: Optional[torch.Tensor] = None,
+        memories: Optional[torch.Tensor] = None,
+        seq_len: int = 1,
+        all_log_probs: bool = False,
+    ) -> Tuple[
+        torch.Tensor, torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], torch.Tensor
+    ]:
         """
         :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
         """

         )

     def evaluate_actions(
-        self, vec_obs, vis_obs, actions, masks=None, memories=None, seq_len=1
-    ):
+        self,
+        vec_obs: torch.Tensor,
+        vis_obs: torch.Tensor,
+        actions: torch.Tensor,
+        masks: Optional[torch.Tensor] = None,
+        memories: Optional[torch.Tensor] = None,
+        seq_len: int = 1,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor]]:
         if len(actions.shape) <= 2:
             actions = actions.unsqueeze(-1)
-        action_list = [actions[..., i] for i in range(actions.shape[2])]
+        action_list = [actions[..., i] for i in range(actions.shape[-1])]
         log_probs, entropies, _ = ModelUtils.get_probs_and_entropy(action_list, dists)
         return log_probs, entropies, value_heads

         :param decision_requests: DecisionStep object containing inputs.
         :return: Outputs from network as defined by self.inference_dict.
         """
-        vec_obs, vis_obs, masks = self.split_decision_step(decision_requests)
-        vec_obs = [torch.as_tensor(vec_obs)]
-        vis_obs = [torch.as_tensor(vis_ob) for vis_ob in vis_obs]
+        vec_vis_obs, masks = self._split_decision_step(decision_requests)
+        vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
+        vis_obs = [
+            torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
+        ]
         memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
             0
         )
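
The renamed _split_decision_step now returns the SplitObservations object itself together with an action mask, where the `1 - np.concatenate(...)` flips Unity's convention (1 = allowed) into the network's (1 = masked out). A self-contained sketch of both steps, assuming observation rank alone distinguishes vector from visual inputs; the helper names here are illustrative, not part of mlagents.

    import numpy as np
    from typing import List, Optional, Tuple


    def split_observations(obs: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        # Flat (batch, features) arrays are vector observations; image-shaped
        # (batch, height, width, channels) arrays are visual observations.
        vector = [o for o in obs if o.ndim == 2]
        visual = [o for o in obs if o.ndim == 4]
        return vector, visual


    def make_action_mask(action_mask: Optional[List[np.ndarray]]) -> Optional[np.ndarray]:
        # Unity sends 1 for "allowed"; the network expects 1 for "masked out".
        if action_mask is None:
            return None
        return 1 - np.concatenate(action_mask, axis=1)


    vec, vis = split_observations([np.zeros((4, 8)), np.zeros((4, 84, 84, 3))])
    assert len(vec) == 1 and len(vis) == 1
    mask = make_action_mask([np.ones((4, 3)), np.ones((4, 2))])
    assert mask.shape == (4, 5)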

ml-agents/mlagents/trainers/ppo/trainer.py (9 changes)


 from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
 from mlagents.trainers.trajectory import Trajectory
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
-from mlagents.trainers.settings import (
-    TrainerSettings,
-    PPOSettings,
-    TestingConfiguration,
-    FrameworkType,
-)
+from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType
 from mlagents.trainers.components.reward_signals import RewardSignal

 try:

             PPOSettings, self.trainer_settings.hyperparameters
         )
         self.seed = seed
-        if TestingConfiguration.max_steps > 0:
-            self.trainer_settings.max_steps = TestingConfiguration.max_steps
         self.policy: Policy = None  # type: ignore

     def _process_trajectory(self, trajectory: Trajectory) -> None:

ml-agents/mlagents/trainers/settings.py (7 changes)


     return {key: cattr.unstructure(val) for key, val in d.items()}

-class TestingConfiguration:
-    use_torch = True
-    max_steps = 0
-    env_name = ""
-    device = "cpu"
-
 class SerializationSettings:
     convert_to_barracuda = True
     convert_to_onnx = True
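
The deleted TestingConfiguration was a plain class whose mutable class attributes acted as process-wide overrides; callers such as ppo/trainer.py and rl_trainer.py checked them with `if TestingConfiguration.max_steps > 0`. A minimal sketch of that override pattern, with a hypothetical Overrides class in place of the removed one:

    class Overrides:
        # Mutable class attributes shared by every importer of this module.
        max_steps = 0  # 0 means "no override"
        device = "cpu"


    def effective_max_steps(configured_max_steps: int) -> int:
        # Mirrors the removed `if TestingConfiguration.max_steps > 0` checks.
        if Overrides.max_steps > 0:
            return Overrides.max_steps
        return configured_max_steps


    Overrides.max_steps = 1000
    assert effective_max_steps(500000) == 1000
    Overrides.max_steps = 0
    assert effective_max_steps(500000) == 500000

Because the attributes are global mutable state, any script that sets them (as experiment_torch.py did) silently changes behavior everywhere in the process, which is presumably why this commit removes the mechanism.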

ml-agents/mlagents/trainers/trainer/rl_trainer.py (10 changes)


 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.agent_processor import AgentManagerQueue
 from mlagents.trainers.trajectory import Trajectory
-from mlagents.trainers.settings import (
-    TestingConfiguration,
-    TrainerSettings,
-    FrameworkType,
-)
+from mlagents.trainers.settings import TrainerSettings, FrameworkType

     from mlagents.trainers.saver.torch_saver import TorchSaver
 except ModuleNotFoundError:
     TorchPolicy = None  # type: ignore

         self.framework = self.trainer_settings.framework
         logger.debug(f"Using framework {self.framework.value}")
-        if TestingConfiguration.max_steps > 0:
-            self.trainer_settings.max_steps = TestingConfiguration.max_steps
         self._next_save_step = 0
         self._next_summary_step = 0
         self.saver = self.create_saver(

ml-agents/mlagents/trainers/tests/torch/test_policy.py (new file, 150 additions)


import pytest
import torch

from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.settings import TrainerSettings, NetworkSettings
from mlagents.trainers.torch.utils import ModelUtils

VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 32
NUM_AGENTS = 12
EPSILON = 1e-7


def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    seed: int = 0,
) -> TorchPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TorchPolicy(seed, mock_spec, trainer_settings)
    return policy


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_policy_evaluate(rnn, visual, discrete):
    # Test evaluate
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    decision_step, terminal_step = mb.create_steps_from_behavior_spec(
        policy.behavior_spec, num_agents=NUM_AGENTS
    )
    run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
    if discrete:
        assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    else:
        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_evaluate_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
    if policy.use_continuous_act:
        actions = ModelUtils.list_to_tensor(buffer["actions"]).unsqueeze(-1)
    else:
        actions = ModelUtils.list_to_tensor(buffer["actions"], dtype=torch.long)
    vis_obs = []
    for idx, _ in enumerate(policy.actor_critic.network_body.visual_encoders):
        vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
        vis_obs.append(vis_ob)
    memories = [
        ModelUtils.list_to_tensor(buffer["memory"][i])
        for i in range(0, len(buffer["memory"]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        actions=actions,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    assert log_probs.shape == (64, policy.behavior_spec.action_size)
    assert entropy.shape == (64, policy.behavior_spec.action_size)
    for val in values.values():
        assert val.shape == (64,)


@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_sample_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
    vis_obs = []
    for idx, _ in enumerate(policy.actor_critic.network_body.visual_encoders):
        vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
        vis_obs.append(vis_ob)
    memories = [
        ModelUtils.list_to_tensor(buffer["memory"][i])
        for i in range(0, len(buffer["memory"]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    (
        sampled_actions,
        log_probs,
        entropies,
        sampled_values,
        memories,
    ) = policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=policy.sequence_length,
        all_log_probs=not policy.use_continuous_act,
    )
    if discrete:
        assert log_probs.shape == (
            64,
            sum(policy.behavior_spec.discrete_action_branches),
        )
    else:
        assert log_probs.shape == (64, policy.behavior_spec.action_shape)
    assert entropies.shape == (64, policy.behavior_spec.action_size)
    for val in sampled_values.values():
        assert val.shape == (64,)
    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
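
Both RNN tests batch memories the same way: keep only the memory stored at the first step of each sequence, then stack those into the (1, batch, memory_size) layout that sample_actions and evaluate_actions expect as an initial state. A small runnable sketch of that slicing, with illustrative sizes:

    import torch

    sequence_length = 16
    memory_size = 12
    buffer_memory = [torch.zeros(memory_size) for _ in range(64)]  # one vector per step

    # Keep only the memory saved at the first step of each sequence...
    start_memories = [
        buffer_memory[i] for i in range(0, len(buffer_memory), sequence_length)
    ]
    # ...and stack into (1, num_sequences, memory_size).
    initial_memories = torch.stack(start_memories).unsqueeze(0)
    assert initial_memories.shape == (1, 4, memory_size)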

experiment_torch.py (deleted, 248 deletions)


import json
import os
import torch
from mlagents.tf_utils import tf
import argparse
from mlagents.trainers.learn import run_cli, parse_command_line
from mlagents.trainers.settings import TestingConfiguration
from mlagents.trainers.stats import StatsReporter
from mlagents_envs.timers import _thread_timer_stacks


def run_experiment(
    name: str,
    steps: int,
    use_torch: bool,
    algo: str,
    num_torch_threads: int,
    use_gpu: bool,
    num_envs: int = 1,
    config_name=None,
):
    TestingConfiguration.env_name = name
    TestingConfiguration.max_steps = steps
    TestingConfiguration.use_torch = use_torch
    TestingConfiguration.device = "cuda:0" if use_gpu else "cpu"
    if use_gpu:
        tf.device("/GPU:0")
    else:
        tf.device("/device:CPU:0")
    if not torch.cuda.is_available() and use_gpu:
        return (
            name,
            str(steps),
            str(use_torch),
            algo,
            str(num_torch_threads),
            str(num_envs),
            str(use_gpu),
            "na",
            "na",
            "na",
            "na",
            "na",
            "na",
            "na",
        )
    if config_name is None:
        config_name = name
    run_options = parse_command_line(
        [f"config/{algo}/{config_name}.yaml", "--num-envs", f"{num_envs}"]
    )
    run_options.checkpoint_settings.run_id = (
        f"{name}_test_" + str(steps) + "_" + ("torch" if use_torch else "tf")
    )
    run_options.checkpoint_settings.force = True
    # run_options.env_settings.num_envs = num_envs
    for trainer_settings in run_options.behaviors.values():
        trainer_settings.threaded = False
    timers_path = os.path.join(
        "results", run_options.checkpoint_settings.run_id, "run_logs", "timers.json"
    )
    if use_torch:
        torch.set_num_threads(num_torch_threads)
    run_cli(run_options)
    StatsReporter.writers.clear()
    StatsReporter.stats_dict.clear()
    _thread_timer_stacks.clear()
    with open(timers_path) as timers_json_file:
        timers_json = json.load(timers_json_file)
        total = timers_json["total"]
        tc_advance = timers_json["children"]["TrainerController.start_learning"][
            "children"
        ]["TrainerController.advance"]
        evaluate = timers_json["children"]["TrainerController.start_learning"][
            "children"
        ]["TrainerController.advance"]["children"]["env_step"]["children"][
            "SubprocessEnvManager._take_step"
        ][
            "children"
        ]
        update = timers_json["children"]["TrainerController.start_learning"][
            "children"
        ]["TrainerController.advance"]["children"]["trainer_advance"]["children"][
            "_update_policy"
        ][
            "children"
        ]
        tc_advance_total = tc_advance["total"]
        tc_advance_count = tc_advance["count"]
    if use_torch:
        if algo == "ppo":
            update_total = update["TorchPPOOptimizer.update"]["total"]
            update_count = update["TorchPPOOptimizer.update"]["count"]
        else:
            update_total = update["SACTrainer._update_policy"]["total"]
            update_count = update["SACTrainer._update_policy"]["count"]
        evaluate_total = evaluate["TorchPolicy.evaluate"]["total"]
        evaluate_count = evaluate["TorchPolicy.evaluate"]["count"]
    else:
        if algo == "ppo":
            update_total = update["PPOOptimizer.update"]["total"]
            update_count = update["PPOOptimizer.update"]["count"]
        else:
            update_total = update["SACTrainer._update_policy"]["total"]
            update_count = update["SACTrainer._update_policy"]["count"]
        evaluate_total = evaluate["NNPolicy.evaluate"]["total"]
        evaluate_count = evaluate["NNPolicy.evaluate"]["count"]
    # todo: do total / count
    return (
        name,
        str(steps),
        str(use_torch),
        algo,
        str(num_torch_threads),
        str(num_envs),
        str(use_gpu),
        str(total),
        str(tc_advance_total),
        str(tc_advance_count),
        str(update_total),
        str(update_count),
        str(evaluate_total),
        str(evaluate_count),
    )


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", default=25000, type=int, help="The number of steps")
    parser.add_argument("--num-envs", default=1, type=int, help="The number of envs")
    parser.add_argument(
        "--gpu", default=False, action="store_true", help="If true, will use the GPU"
    )
    parser.add_argument(
        "--threads",
        default=False,
        action="store_true",
        help="If true, will try both 1 and 8 threads for torch",
    )
    parser.add_argument(
        "--ball",
        default=False,
        action="store_true",
        help="If true, will only do 3dball",
    )
    parser.add_argument(
        "--sac",
        default=False,
        action="store_true",
        help="If true, will run sac instead of ppo",
    )
    args = parser.parse_args()

    if args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    algo = "ppo"
    if args.sac:
        algo = "sac"
    envs_config_tuples = [
        ("3DBall", "3DBall"),
        ("GridWorld", "GridWorld"),
        ("PushBlock", "PushBlock"),
        ("CrawlerStaticTarget", "CrawlerStatic"),
    ]
    if algo == "ppo":
        envs_config_tuples += [
            ("Hallway", "Hallway"),
            ("VisualHallway", "VisualHallway"),
        ]
    if args.ball:
        envs_config_tuples = [("3DBall", "3DBall")]

    labels = (
        "name",
        "steps",
        "use_torch",
        "algorithm",
        "num_torch_threads",
        "num_envs",
        "use_gpu",
        "total",
        "tc_advance_total",
        "tc_advance_count",
        "update_total",
        "update_count",
        "evaluate_total",
        "evaluate_count",
    )
    results = []
    results.append(labels)
    f = open(
        f"result_data_steps_{args.steps}_algo_{algo}_envs_{args.num_envs}_gpu_{args.gpu}_thread_{args.threads}.txt",
        "w",
    )
    f.write(" ".join(labels) + "\n")
    for env_config in envs_config_tuples:
        data = run_experiment(
            name=env_config[0],
            steps=args.steps,
            use_torch=True,
            algo=algo,
            num_torch_threads=1,
            use_gpu=args.gpu,
            num_envs=args.num_envs,
            config_name=env_config[1],
        )
        results.append(data)
        f.write(" ".join(data) + "\n")
        if args.threads:
            data = run_experiment(
                name=env_config[0],
                steps=args.steps,
                use_torch=True,
                algo=algo,
                num_torch_threads=8,
                use_gpu=args.gpu,
                num_envs=args.num_envs,
                config_name=env_config[1],
            )
            results.append(data)
            f.write(" ".join(data) + "\n")
        data = run_experiment(
            name=env_config[0],
            steps=args.steps,
            use_torch=False,
            algo=algo,
            num_torch_threads=1,
            use_gpu=args.gpu,
            num_envs=args.num_envs,
            config_name=env_config[1],
        )
        results.append(data)
        f.write(" ".join(data) + "\n")
    for r in results:
        print(*r)
    f.close()


if __name__ == "__main__":
    main()
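
run_experiment digs values out of timers.json through long chains of ["children"][...] lookups, which raise a bare KeyError when a timer name is absent. A hedged sketch of the same traversal behind a small helper; timer_node is hypothetical, not part of mlagents:

    from typing import Any, Dict


    def timer_node(timers: Dict[str, Any], *path: str) -> Dict[str, Any]:
        # Walk the nested timer tree one named child at a time; a KeyError
        # raised here names the exact timer that is missing.
        node = timers
        for key in path:
            node = node["children"][key]
        return node


    timers = {
        "total": 1.0,
        "children": {
            "TrainerController.start_learning": {
                "total": 0.9,
                "children": {
                    "TrainerController.advance": {"total": 0.8, "count": 10, "children": {}}
                },
            }
        },
    }
    advance = timer_node(timers, "TrainerController.start_learning", "TrainerController.advance")
    assert advance["count"] == 10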