ml-agents/ml-agents/mlagents/trainers/demo_loader.py


								import os

								from typing import List, Tuple

								import numpy as np

								from mlagents.trainers.buffer import AgentBuffer, BufferKey

								from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (

								    AgentInfoActionPairProto,

								)

								from mlagents.trainers.trajectory import ObsUtil

								from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto

								from mlagents_envs.base_env import BehaviorSpec

								from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto

								from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (

								    DemonstrationMetaProto,

								)

								from mlagents_envs.timers import timed, hierarchical_timer

								from google.protobuf.internal.decoder import _DecodeVarint32  # type: ignore

								from google.protobuf.internal.encoder import _EncodeVarint  # type: ignore


								INITIAL_POS = 33

								SUPPORTED_DEMONSTRATION_VERSIONS = frozenset([0, 1])


								@timed

								def make_demo_buffer(

								    pair_infos: List[AgentInfoActionPairProto],

								    behavior_spec: BehaviorSpec,

								    sequence_length: int,

								) -> AgentBuffer:

								    # Create and populate buffer using experiences

								    demo_raw_buffer = AgentBuffer()

								    demo_processed_buffer = AgentBuffer()

								    for idx, current_pair_info in enumerate(pair_infos):

								        if idx > len(pair_infos) - 2:

								            break

								        next_pair_info = pair_infos[idx + 1]

								        current_decision_step, current_terminal_step = steps_from_proto(

								            [current_pair_info.agent_info], behavior_spec

								        )

								        next_decision_step, next_terminal_step = steps_from_proto(

								            [next_pair_info.agent_info], behavior_spec

								        )

								        previous_action = (

								            np.array(

								                pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32

								            )

								            * 0

								        )

								        if idx > 0:

								            previous_action = np.array(

								                pair_infos[idx - 1].action_info.vector_actions_deprecated,

								                dtype=np.float32,

								            )


								        next_done = len(next_terminal_step) == 1

								        next_reward = 0

								        if len(next_terminal_step) == 1:

								            next_reward = next_terminal_step.reward[0]

								        else:

								            next_reward = next_decision_step.reward[0]

								        current_obs = None

								        if len(current_terminal_step) == 1:

								            current_obs = list(current_terminal_step.values())[0].obs

								        else:

								            current_obs = list(current_decision_step.values())[0].obs


								        demo_raw_buffer[BufferKey.DONE].append(next_done)

								        demo_raw_buffer[BufferKey.ENVIRONMENT_REWARDS].append(next_reward)

								        for i, obs in enumerate(current_obs):

								            demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)

								        if (

								            len(current_pair_info.action_info.continuous_actions) == 0

								            and len(current_pair_info.action_info.discrete_actions) == 0

								        ):

								            if behavior_spec.action_spec.continuous_size > 0:

								                demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(

								                    current_pair_info.action_info.vector_actions_deprecated

								                )

								            else:

								                demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(

								                    current_pair_info.action_info.vector_actions_deprecated

								                )

								        else:

								            if behavior_spec.action_spec.continuous_size > 0:

								                demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append(

								                    current_pair_info.action_info.continuous_actions

								                )

								            if behavior_spec.action_spec.discrete_size > 0:

								                demo_raw_buffer[BufferKey.DISCRETE_ACTION].append(

								                    current_pair_info.action_info.discrete_actions

								                )

								        demo_raw_buffer[BufferKey.PREV_ACTION].append(previous_action)

								        if next_done:

								            demo_raw_buffer.resequence_and_append(

								                demo_processed_buffer, batch_size=None, training_length=sequence_length

								            )

								            demo_raw_buffer.reset_agent()

								    demo_raw_buffer.resequence_and_append(

								        demo_processed_buffer, batch_size=None, training_length=sequence_length

								    )

								    return demo_processed_buffer


								@timed

								def demo_to_buffer(

								    file_path: str, sequence_length: int, expected_behavior_spec: BehaviorSpec = None

								) -> Tuple[BehaviorSpec, AgentBuffer]:

								    """

								    Loads demonstration file and uses it to fill training buffer.

								    :param file_path: Location of demonstration file (.demo).

								    :param sequence_length: Length of trajectories to fill buffer.

								    :return:

								    """

								    behavior_spec, info_action_pair, _ = load_demonstration(file_path)

								    demo_buffer = make_demo_buffer(info_action_pair, behavior_spec, sequence_length)

								    if expected_behavior_spec:

								        # check action dimensions in demonstration match

								        if behavior_spec.action_spec != expected_behavior_spec.action_spec:

								            raise RuntimeError(

								                "The actions {} in demonstration do not match the policy's {}.".format(

								                    behavior_spec.action_spec, expected_behavior_spec.action_spec

								                )

								            )

								        # check observations match

								        if len(behavior_spec.observation_specs) != len(

								            expected_behavior_spec.observation_specs

								        ):

								            raise RuntimeError(

								                "The demonstrations do not have the same number of observations as the policy."

								            )

								        else:

								            for i, (demo_obs, policy_obs) in enumerate(

								                zip(

								                    behavior_spec.observation_specs,

								                    expected_behavior_spec.observation_specs,

								                )

								            ):

								                if demo_obs.shape != policy_obs.shape:

								                    raise RuntimeError(

								                        f"The shape {demo_obs} for observation {i} in demonstration \

								                        do not match the policy's {policy_obs}."

								                    )

								    return behavior_spec, demo_buffer


								def get_demo_files(path: str) -> List[str]:

								    """

								    Retrieves the demonstration file(s) from a path.

								    :param path: Path of demonstration file or directory.

								    :return: List of demonstration files


								    Raises errors if |path| is invalid.

								    """

								    if os.path.isfile(path):

								        if not path.endswith(".demo"):

								            raise ValueError("The path provided is not a '.demo' file.")

								        return [path]

								    elif os.path.isdir(path):

								        paths = [

								            os.path.join(path, name)

								            for name in os.listdir(path)

								            if name.endswith(".demo")

								        ]

								        if not paths:

								            raise ValueError("There are no '.demo' files in the provided directory.")

								        return paths

								    else:

								        raise FileNotFoundError(

								            f"The demonstration file or directory {path} does not exist."

								        )


								@timed

								def load_demonstration(

								    file_path: str,

								) -> Tuple[BehaviorSpec, List[AgentInfoActionPairProto], int]:

								    """

								    Loads and parses a demonstration file.

								    :param file_path: Location of demonstration file (.demo).

								    :return: BrainParameter and list of AgentInfoActionPairProto containing demonstration data.

								    """


								    # First 32 bytes of file dedicated to meta-data.

								    file_paths = get_demo_files(file_path)

								    behavior_spec = None

								    brain_param_proto = None

								    info_action_pairs = []

								    total_expected = 0

								    for _file_path in file_paths:

								        with open(_file_path, "rb") as fp:

								            with hierarchical_timer("read_file"):

								                data = fp.read()

								            next_pos, pos, obs_decoded = 0, 0, 0

								            while pos < len(data):

								                next_pos, pos = _DecodeVarint32(data, pos)

								                if obs_decoded == 0:

								                    meta_data_proto = DemonstrationMetaProto()

								                    meta_data_proto.ParseFromString(data[pos : pos + next_pos])

								                    if (

								                        meta_data_proto.api_version

								                        not in SUPPORTED_DEMONSTRATION_VERSIONS

								                    ):

								                        raise RuntimeError(

								                            f"Can't load Demonstration data from an unsupported version ({meta_data_proto.api_version})"

								                        )

								                    total_expected += meta_data_proto.number_steps

								                    pos = INITIAL_POS

								                if obs_decoded == 1:

								                    brain_param_proto = BrainParametersProto()

								                    brain_param_proto.ParseFromString(data[pos : pos + next_pos])

								                    pos += next_pos

								                if obs_decoded > 1:

								                    agent_info_action = AgentInfoActionPairProto()

								                    agent_info_action.ParseFromString(data[pos : pos + next_pos])

								                    if behavior_spec is None:

								                        behavior_spec = behavior_spec_from_proto(

								                            brain_param_proto, agent_info_action.agent_info

								                        )

								                    info_action_pairs.append(agent_info_action)

								                    if len(info_action_pairs) == total_expected:

								                        break

								                    pos += next_pos

								                obs_decoded += 1

								    if not behavior_spec:

								        raise RuntimeError(

								            f"No BrainParameters found in demonstration file at {file_path}."

								        )

								    return behavior_spec, info_action_pairs, total_expected


								def write_delimited(f, message):

								    msg_string = message.SerializeToString()

								    msg_size = len(msg_string)

								    _EncodeVarint(f.write, msg_size)

								    f.write(msg_string)


								def write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos):

								    with open(demo_path, "wb") as f:

								        # write metadata

								        write_delimited(f, meta_data_proto)

								        f.seek(INITIAL_POS)

								        write_delimited(f, brain_param_proto)


								        for agent in agent_info_protos:

								            write_delimited(f, agent)