import numpy as np

history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
                'value_estimates', 'advantages', 'discounted_returns']


def discount_rewards(r, gamma=0.99, value_next=0.0):
    """
    Computes discounted sum of future rewards for use in updating value estimate.
    :param r: List of rewards.
    :param gamma: Discount factor.
    :param value_next: T+1 value estimate for returns calculation.
    :return: discounted sum of future rewards as numpy array.
    """
    # Cast to a float array so r.size works on plain lists and integer
    # rewards are not truncated when accumulated.
    r = np.asarray(r, dtype=float)
    discounted_r = np.zeros_like(r)
    running_add = value_next
    # Walk the trajectory backwards: G_t = r_t + gamma * G_{t+1},
    # bootstrapped from value_next at the final step.
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
    """
    Computes generalized advantage estimate for use in updating policy.
    :param rewards: list of rewards for time-steps t to T.
    :param value_next: Value estimate for time-step T+1.
    :param value_estimates: list of value estimates for time-steps t to T.
    :param gamma: Discount factor.
    :param lambd: GAE weighting factor.
    :return: array of advantage estimates for time-steps t to T.
    """
    rewards = np.asarray(rewards, dtype=float)
    # Append the bootstrap value so V(s_{t+1}) is defined for every step.
    value_estimates = np.append(np.asarray(value_estimates, dtype=float), value_next)
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
    # GAE is the discounted sum of the residuals with discount gamma * lambd.
    advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
    return advantage


def empty_local_history(agent_dict):
    """
    Empties the experience history for a single agent.
    :param agent_dict: Dictionary of agent experience history.
    :return: Emptied dictionary (except for cumulative_reward and episode_steps).
    """
    for key in history_keys:
        agent_dict[key] = []
    return agent_dict


def vectorize_history(agent_dict):
    """
    Converts dictionary of lists into dictionary of numpy arrays.
    :param agent_dict: Dictionary of agent experience history.
    :return: dictionary of numpy arrays.
    """
    for key in history_keys:
        agent_dict[key] = np.array(agent_dict[key])
    return agent_dict


def empty_all_history(agent_info):
    """
    Clears all agent histories and resets reward and episode length counters.
    :param agent_info: a BrainInfo object.
    :return: an emptied history dictionary.
    """
    history_dict = {}
    for agent in agent_info.agents:
        history_dict[agent] = empty_local_history({})
        history_dict[agent]['cumulative_reward'] = 0
        history_dict[agent]['episode_steps'] = 0
    return history_dict


def append_history(global_buffer, local_buffer=None):
    """
    Appends agent experience history to global history buffer.
    :param global_buffer: Global buffer for all agents' experiences.
    :param local_buffer: Local history for individual agents' experiences.
    :return: Global buffer with new experiences added.
    """
    for key in history_keys:
        global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
    return global_buffer


def set_history(global_buffer, local_buffer=None):
    """
    Creates new global_buffer from existing local_buffer.
    :param global_buffer: Global buffer for all agents' experiences.
    :param local_buffer: Local history for individual agents' experiences.
    :return: Global buffer with new experiences.
    """
    for key in history_keys:
        global_buffer[key] = np.copy(local_buffer[key])
    return global_buffer


def shuffle_buffer(global_buffer):
    """
    Randomizes the order of experiences in global_buffer.
    :param global_buffer: training_buffer to randomize.
    :return: Randomized buffer.
    """
    # All per-key arrays share the same leading (time-step) dimension, so one
    # shared permutation keeps corresponding entries aligned across keys.
    s = np.arange(global_buffer['actions'].shape[0])
    np.random.shuffle(s)  # without this, indexing by s is an identity no-op
    for key in history_keys:
        if len(global_buffer[key]) > 0:
            global_buffer[key] = global_buffer[key][s]
    return global_buffer
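
# ---------------------------------------------------------------------------
# Minimal sanity-check sketch (not part of the training loop): exercises
# discount_rewards and get_gae on a hand-checkable three-step trajectory.
# All reward and value numbers below are made up for illustration only.
# ---------------------------------------------------------------------------
def _demo_advantage_estimation():
    rewards = np.array([1.0, 0.0, 1.0])
    values = np.array([0.5, 0.4, 0.6])
    # Returns follow G_t = r_t + gamma * G_{t+1}; with gamma=0.99 and no
    # bootstrap value this gives [1.9801, 0.99, 1.0].
    returns = discount_rewards(rewards, gamma=0.99)
    # Advantages follow A_t = delta_t + gamma * lambd * A_{t+1}, where
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    advantages = get_gae(rewards, values, gamma=0.99, lambd=0.95)
    print('returns:', returns)
    print('advantages:', advantages)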
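
# ---------------------------------------------------------------------------
# Sketch of the intended buffer lifecycle, under the assumption that a caller
# drives it as: empty_all_history -> append per-step data -> vectorize_history
# -> compute advantages/returns -> set_history / append_history ->
# shuffle_buffer. _FakeBrainInfo is a hypothetical stand-in for the BrainInfo
# object; only its .agents attribute is used. The per-step numbers and the
# returns target (advantages + value estimates) are illustrative assumptions,
# not taken from this module.
# ---------------------------------------------------------------------------
class _FakeBrainInfo:
    agents = ['agent-0']


def _demo_buffer_lifecycle():
    history = empty_all_history(_FakeBrainInfo())
    local = history['agent-0']
    # Simulate two environment steps with made-up per-step data.
    for step in range(2):
        local['states'].append([0.0, float(step)])
        local['observations'].append([0.0])
        local['actions'].append(step)
        local['rewards'].append(1.0)
        local['action_probs'].append(0.5)
        local['epsilons'].append(0.0)
        local['value_estimates'].append(0.5)
        local['advantages'].append(0.0)          # placeholder, overwritten below
        local['discounted_returns'].append(0.0)  # placeholder, overwritten below
    local = vectorize_history(local)
    local['advantages'] = get_gae(local['rewards'], local['value_estimates'])
    local['discounted_returns'] = local['advantages'] + local['value_estimates']
    training_buffer = set_history({}, local)                  # first agent seeds the buffer
    training_buffer = append_history(training_buffer, local)  # later agents append
    training_buffer = shuffle_buffer(training_buffer)
    print('buffer size:', training_buffer['actions'].shape[0])


if __name__ == '__main__':
    _demo_advantage_estimation()
    _demo_buffer_lifecycle()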