def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
    """
    Creates a PPO policy for this trainer, using the multi-GPU variant when
    more than one GPU device is available.
    :param brain_parameters: specifications for constructing the policy network
    :return: the newly created policy
    """
    if self.multi_gpu and len(get_devices()) > 1:
        policy = MultiGpuPPOPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
        )
    else:
        policy = PPOPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
        )
    # Prepare reward bookkeeping for each reward signal the policy exposes.
    for _reward_signal in policy.reward_signals.keys():
        self.collected_rewards[_reward_signal] = {}

    self.ppo_policy = policy
    return policy


def discount_rewards(r, gamma=0.99, value_next=0.0):
    """Returns the discounted sum of future rewards for each timestep of r."""
    discounted_r = np.zeros_like(r)
    # Accumulate returns backwards through the trajectory, bootstrapping from
    # value_next, the value estimate of the state after the final step.
    running_add = value_next
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
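# Illustrative example (hand-computed, not from the original source; assumes
# numpy is imported as np in this module): for rewards [1.0, 0.0, 1.0] with
# gamma=0.9 and a bootstrap estimate value_next=0.5 for the state after the
# final step, the returns are filled in backwards from the end of the trajectory:
#   t=2: 0.5 * 0.9 + 1.0 = 1.45
#   t=1: 1.45 * 0.9 + 0.0 = 1.305
#   t=0: 1.305 * 0.9 + 1.0 = 2.1745
# so discount_rewards(np.array([1.0, 0.0, 1.0]), 0.9, 0.5) returns
# array([2.1745, 1.305, 1.45]).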