
Merge branch 'main' into develop-soccer-groupman-mod

Andrew Cohen, 4 years ago
Commit 9176247c
10 changed files with 45 additions and 44 deletions
1. com.unity.ml-agents/CHANGELOG.md (1 change)
2. ml-agents/mlagents/trainers/poca/trainer.py (2 changes)
3. ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)
4. ml-agents/mlagents/trainers/sac/trainer.py (18 changes)
5. ml-agents/mlagents/trainers/trainer/rl_trainer.py (8 changes)
6. ml-agents/mlagents/trainers/trainer/trainer.py (4 changes)
7. ml-agents/mlagents/trainers/trainer_controller.py (2 changes)
8. config/poca/PushBlockCollab.yaml (26 changes)
9. config/ppo/PushBlockCollab.yaml (26 changes)

com.unity.ml-agents/CHANGELOG.md (1 change)


 #### ml-agents / ml-agents-envs / gym-unity (Python)
 - An issue that caused `GAIL` to fail for environments where agents can terminate episodes by self-sacrifice has been fixed. (#4971)
 - Made the error message when observations of different shapes are sent to the trainer clearer. (#5030)
+- An issue that prevented curriculums from incrementing with self-play has been fixed. (#5098)
 ## [1.8.1-preview] - 2021-03-08
 ### Minor Changes

ml-agents/mlagents/trainers/poca/trainer.py (2 changes)


         self.model_saver.initialize_or_load()
         # Needed to resume loads properly
-        self.step = policy.get_current_step()
+        self._step = policy.get_current_step()
     def get_policy(self, name_behavior_id: str) -> Policy:
         """

ml-agents/mlagents/trainers/ppo/trainer.py (2 changes)


         self.model_saver.initialize_or_load()
         # Needed to resume loads properly
-        self.step = policy.get_current_step()
+        self._step = policy.get_current_step()
     def get_policy(self, name_behavior_id: str) -> Policy:
         """

ml-agents/mlagents/trainers/sac/trainer.py (18 changes)


         self.hyperparameters: SACSettings = cast(
             SACSettings, trainer_settings.hyperparameters
         )
-        self.step = 0
+        self._step = 0
         # Don't divide by zero
         self.update_steps = 1

"""
return (
self.update_buffer.num_experiences >= self.hyperparameters.batch_size
and self.step >= self.hyperparameters.buffer_init_steps
and self._step >= self.hyperparameters.buffer_init_steps
)
@timed

         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
-            self.step - self.hyperparameters.buffer_init_steps
+            self._step - self.hyperparameters.buffer_init_steps
-            logger.debug(f"Updating SAC policy at step {self.step}")
+            logger.debug(f"Updating SAC policy at step {self._step}")
             buffer = self.update_buffer
             if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
                 sampled_minibatch = buffer.sample_mini_batch(

         )
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
-            self.step - self.hyperparameters.buffer_init_steps
+            self._step - self.hyperparameters.buffer_init_steps
-                logger.debug(f"Updating {name} at step {self.step}")
+                logger.debug(f"Updating {name} at step {self._step}")
                 if name != "extrinsic":
                     reward_signal_minibatches[name] = buffer.sample_mini_batch(
                         self.hyperparameters.batch_size,

         self.model_saver.initialize_or_load()
         # Needed to resume loads properly
-        self.step = policy.get_current_step()
+        self._step = policy.get_current_step()
-        self.update_steps = int(max(1, self.step / self.steps_per_update))
+        self.update_steps = int(max(1, self._step / self.steps_per_update))
-            max(1, self.step / self.reward_signal_steps_per_update)
+            max(1, self._step / self.reward_signal_steps_per_update)
         )
     def get_policy(self, name_behavior_id: str) -> Policy:
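The SAC trainer additionally re-derives its update counters from the restored step, so the update-to-step ratio stays consistent across a resume; otherwise the `while` loops above would try to catch up on every update "missed" since step 0. A rough numeric sketch of that bookkeeping, with variable names mirroring the diff and invented numbers:

# Invented numbers purely for illustration.
restored_step = 60_000            # self._step after policy.get_current_step()
steps_per_update = 10.0           # env steps per SAC policy update
reward_signal_steps_per_update = 20.0

# Mirrors: self.update_steps = int(max(1, self._step / self.steps_per_update))
update_steps = int(max(1, restored_step / steps_per_update))
reward_signal_update_steps = int(max(1, restored_step / reward_signal_steps_per_update))

print(update_steps)                # 6000 policy updates assumed already performed
print(reward_signal_update_steps)  # 3000 reward-signal updates assumed already performed

With these seeded counters (and buffer_init_steps left at 0 in this sketch), the ratio in the `while` condition is exactly at the threshold right after the resume, so the trainer does not replay thousands of catch-up updates before collecting new experience.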

ml-agents/mlagents/trainers/trainer/rl_trainer.py (8 changes)


             logger.warning(
                 "Trainer has multiple policies, but default behavior only saves the first."
             )
-        checkpoint_path = self.model_saver.save_checkpoint(self.brain_name, self.step)
+        checkpoint_path = self.model_saver.save_checkpoint(self.brain_name, self._step)
-            int(self.step),
+            int(self._step),
             f"{checkpoint_path}.{export_ext}",
             self._policy_mean_reward(),
             time.time(),

         Increment the step count of the trainer
         :param n_steps: number of steps to increment the step count by
         """
-        self.step += n_steps
+        self._step += n_steps
         self._next_summary_step = self._get_next_interval_step(self.summary_freq)
         self._next_save_step = self._get_next_interval_step(
             self.trainer_settings.checkpoint_interval

         Get the next step count that should result in an action.
         :param interval: The interval between actions.
         """
-        return self.step + (interval - self.step % interval)
+        return self._step + (interval - self._step % interval)
     def _write_summary(self, step: int) -> None:
         """

ml-agents/mlagents/trainers/trainer/trainer.py (4 changes)


         self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap)
         self.policy_queues: List[AgentManagerQueue[Policy]] = []
         self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = []
-        self.step: int = 0
+        self._step: int = 0
         self.artifact_path = artifact_path
         self.summary_freq = self.trainer_settings.summary_freq
         self.policies: Dict[str, Policy] = {}

         Returns the number of steps the trainer has performed
         :return: the step count of the trainer
         """
-        return self.step
+        return self._step
     @property
     def threaded(self) -> bool:

ml-agents/mlagents/trainers/trainer_controller.py (2 changes)


     def reset_env_if_ready(self, env: EnvManager) -> None:
         # Get the sizes of the reward buffers.
         reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()}
-        curr_step = {k: int(t.step) for (k, t) in self.trainers.items()}
+        curr_step = {k: int(t.get_step) for (k, t) in self.trainers.items()}
         max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()}
         # Attempt to increment the lessons of the brains who
         # were ready.
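Taken together, the trainer.py and trainer_controller.py hunks are the heart of this merge: the step counter becomes a private `_step` attribute, it is exposed read-only through the `get_step` property, and the controller reads `t.get_step` instead of `t.step`. A minimal sketch of that encapsulation pattern with a simplified stand-in class (not the real ML-Agents Trainer):

class SketchTrainer:
    """Stripped-down trainer showing the private counter plus read-only property."""

    def __init__(self) -> None:
        self._step: int = 0  # internal; callers should not write this directly

    @property
    def get_step(self) -> int:
        """Returns the number of steps the trainer has performed."""
        return self._step

    def _increment_step(self, n_steps: int) -> None:
        self._step += n_steps


trainers = {"PushBlockCollab": SketchTrainer()}
trainers["PushBlockCollab"]._increment_step(3)

# Mirrors trainer_controller.py after the change: read the count via the property.
curr_step = {k: int(t.get_step) for (k, t) in trainers.items()}
assert curr_step == {"PushBlockCollab": 3}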

config/poca/PushBlockCollab.yaml (26 changes)


+behaviors:
+  PushBlockCollab:
+    trainer_type: poca
+    hyperparameters:
+      batch_size: 1024
+      buffer_size: 10240
+      learning_rate: 0.0003
+      beta: 0.01
+      epsilon: 0.2
+      lambd: 0.95
+      num_epoch: 3
+      learning_rate_schedule: constant
+    network_settings:
+      normalize: false
+      hidden_units: 256
+      num_layers: 2
+      vis_encode_type: simple
+    reward_signals:
+      extrinsic:
+        gamma: 0.99
+        strength: 1.0
+    keep_checkpoints: 5
+    max_steps: 20000000
+    time_horizon: 64
+    summary_freq: 60000
+    threaded: true
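This new file follows the standard ML-Agents trainer-config layout: a behavior name keyed under `behaviors`, then `hyperparameters`, `network_settings`, `reward_signals`, and the top-level training limits. As a quick sanity check you can load it with plain PyYAML and inspect a few values; the snippet assumes PyYAML is installed and that the file lives at config/poca/PushBlockCollab.yaml as in this diff:

import yaml  # PyYAML

with open("config/poca/PushBlockCollab.yaml") as f:
    cfg = yaml.safe_load(f)

behavior = cfg["behaviors"]["PushBlockCollab"]
print(behavior["trainer_type"])                          # poca
print(behavior["hyperparameters"]["batch_size"])         # 1024
print(behavior["reward_signals"]["extrinsic"]["gamma"])  # 0.99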

config/ppo/PushBlockCollab.yaml (26 changes)


+behaviors:
+  PushBlockCollab:
+    trainer_type: poca
+    hyperparameters:
+      batch_size: 1024
+      buffer_size: 10240
+      learning_rate: 0.0003
+      beta: 0.01
+      epsilon: 0.2
+      lambd: 0.95
+      num_epoch: 3
+      learning_rate_schedule: constant
+    network_settings:
+      normalize: false
+      hidden_units: 256
+      num_layers: 2
+      vis_encode_type: simple
+    reward_signals:
+      extrinsic:
+        gamma: 0.99
+        strength: 1.0
+    keep_checkpoints: 5
+    max_steps: 20000000
+    time_horizon: 64
+    summary_freq: 60000
+    threaded: true