|
|
|
|
|
|
return RewardSignalResult(scaled_reward, unscaled_reward) |
|
|
|
|
|
|
|
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
    """
    Build the reward result for a mini-batch from its recorded environment rewards.

    :param mini_batch: Mapping that must contain an "environment_rewards"
        entry; a missing key raises KeyError.
    :return: A RewardSignalResult built from two positional values: the
        rewards multiplied by ``self.strength``, followed by the rewards
        themselves (presumably the scaled and unscaled fields respectively;
        confirm against the RewardSignalResult definition).
    """
    # Convert once, directly to float32, so both values handed to the result
    # share a single dtype regardless of how the rewards were stored.
    env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
    return RewardSignalResult(self.strength * env_rews, env_rews)