|
|
|
|
|
|
"""The PPOTrainer is an implementation of the PPO algorithm.""" |
|
|
|
|
|
|
|
def __init__(
    self,
    brain_name: str,
    reward_buff_cap: int,
    trainer_settings: TrainerSettings,
    training: bool,
    load: bool,
    seed: int,
    artifact_path: str,
):
|
|
|
""" |
|
|
|
Responsible for collecting experiences and training PPO model. |
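
    Example (illustrative only; the argument values below are placeholders,
    not taken from this file):

        trainer = PPOTrainer(
            brain_name="MyBehavior",
            reward_buff_cap=100,
            trainer_settings=TrainerSettings(),
            training=True,
            load=False,
            seed=0,
            artifact_path="results/run_id/MyBehavior",
        )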
|
|
|
|
|
|
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.done_reached and not trajectory.interrupted,
    )
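    # Store each reward signal's value estimates in the trajectory buffer and
    # record a per-signal stat via the stats reporter.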
|
|
|
    for name, v in value_estimates.items():
        agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
        self._stats_reporter.add_stat(
|
|
|
|
|
|
    # Make sure batch_size is a multiple of sequence length. During training, we
    # will need to reshape the data into a batch_size x sequence_length tensor.
    batch_size = (
        self.hyperparameters.batch_size
        - self.hyperparameters.batch_size % self.policy.sequence_length
    )
    # Make sure there is at least one sequence
    batch_size = max(batch_size, self.policy.sequence_length)
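    # Illustrative arithmetic: with batch_size=1000 and sequence_length=64,
    # 1000 - (1000 % 64) truncates the batch to 960; with batch_size=32 the
    # truncation yields 0, so max() clamps it back up to one full sequence (64).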
|
|
|
|
|
|
    for stat, val in update_stats.items():
        self._stats_reporter.add_stat(stat, val)
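    # PPO is on-policy, so the experience collected for this update is stale
    # once the policy changes; clear the update buffer rather than reusing it.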
|
|
|
    self._clear_update_buffer()
|
|
|
|
|
|
|
    # # # DYNAMIC BUFFER TEST
    # # if self.get_step in range(0, 50000):
    # #     # self.expand_buffer(64, 64, 640)
    # #     # self.expand_buffer(64, 2048, 2048)
    # #     self.expand_buffer(64, 2048, 512)
    # batchBase = 256 #512
    # bufferBase = 2560 #5120
    # if self.get_step in range(0, 100000):
    #     # self.expand_buffer(64, 128, 1280)
    #     # self.expand_buffer(64, 512, 5120)
    #     self.expand_buffer(64, batchBase * 1, bufferBase * 1)
    #     # self.expand_buffer(64, 512, 1536)
    #     # self.expand_buffer(64, 2048, 4096)
    #     # self.expand_buffer(64, 2048, 1024)
    # elif self.get_step in range(100000, 300000):
    #     # self.expand_buffer(64, 1024, 4096)
    #     self.expand_buffer(64, batchBase * 2, bufferBase * 2)
    #     # self.expand_buffer(128, 256, 2560)
    # elif self.get_step in range(300000, 2000000):
    #     # self.expand_buffer(64, 2048, 8192)
    #     self.expand_buffer(64, batchBase * 4, bufferBase * 4)
    #     # self.expand_buffer(256, 512, 5120)
    # elif self.get_step in range(2000000, 4500000):
    #     # self.expand_buffer(64, 2048, 16384)
    #     # self.expand_buffer(64, 2048, 4096)
    #     # elif self.get_step in range(2000000, 3500000):
    #     #     self.expand_buffer(512, 1024, 10240)
    #     self.expand_buffer(1000, batchBase * 8, bufferBase * 8)
    # else:
    #     # self.expand_buffer(64, 2048, 32768)
    #     # self.expand_buffer(256, 512, 5120)
    #     # self.expand_buffer(512, 1024, 10240)
    #     self.expand_buffer(1000, batchBase * 8, bufferBase * 8)
    #     # self.expand_buffer(1000, 2048, 20480)
|
|
|
|
|
|
|
def expand_buffer(self, th: int, batch: int, buffer: int) -> None:
    """
    Dynamically grow the PPO batch and buffer sizes mid-training.

    `th` is a time-horizon value; it is only referenced by the commented-out
    variants below and is currently ignored by the live code.
    """
    # setattr(self.hyperparameters, "buffer_size", buffer)
    # self.hyperparameters["buffer_size"] = buffer
    # print(f'updating policy: self.get_step: {self.get_step} | buffer_size: {self.hyperparameters.buffer_size}.')
    # print(f'updating policy: self.get_step: {self.get_step} | buffer_size: {self.hyperparameters["buffer_size"]}.')
    # print(f'updating policy: self.get_step: {self.get_step} | batch_size: {self.trainer_parameters["batch_size"]} | buffer_size: {self.trainer_parameters["buffer_size"]}.')
    # print(self.trainer_parameters["time_horizon"])
    # print(self.trainer_parameters["batch_size"])
    # print(self.trainer_parameters["buffer_size"])
    # self.trainer_parameters["time_horizon"] = th
    # self.trainer_parameters["batch_size"] = batch
    # self.trainer_settings["buffer_size"] = buffer
    self.hyperparameters.batch_size = batch
    self.hyperparameters.buffer_size = buffer
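
# Illustrative sketch only (not part of the original trainer): the step-based
# schedule from the commented-out DYNAMIC BUFFER TEST block above, expressed as
# a single lookup. The name `_buffer_scale_for_step` is introduced here for
# illustration; the step thresholds mirror the ranges tried in that block.
# (The original block also bumps the first, time-horizon argument from 64 to
# 1000 at the x8 stage; that detail is omitted from this sketch.)
def _buffer_scale_for_step(self, step: int) -> int:
    # (upper step bound, batch/buffer multiplier) pairs
    for bound, scale in [(100_000, 1), (300_000, 2), (2_000_000, 4)]:
        if step < bound:
            return scale
    return 8  # past the last threshold, keep the largest multiplier

# Hypothetical call site, using the base sizes from the comments above:
#     scale = self._buffer_scale_for_step(self.get_step)
#     self.expand_buffer(64, batchBase * scale, bufferBase * scale)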
|
|
|
|
|
|
|
    self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
    """
    Creates a PPO policy to add to the trainer's list of policies.
|
|
|
|
|
|
    return policy
|
|
|
|
|
|
|
def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Adds a policy to the trainer.
|
|
|