Unity 机器学习代理工具包 (ML-Agents) 是一个开源项目,它使游戏和模拟能够作为训练智能代理的环境。
您最多选择 25 个主题。主题必须以中文、字母或数字开头,可以包含连字符 (-),并且长度不得超过 35 个字符。
 
 
 
 
 

101 行
3.9 KiB

import tensorflow as tf
import numpy as np
from mlagents.trainers.models import LearningModel
class BCModel(object):
    """
    TensorFlow graph operations for Behavioral Cloning (BC) on a policy model.

    The constructor wires the expert observation inputs to the policy's own
    input tensors, creates placeholders for the expert actions, and builds a
    supervised loss plus an Adam update op.

    Attributes set on the instance:
        policy_model: the wrapped policy.
        expert_visual_in / obs_in_expert: aliases of the policy's
            ``visual_in`` / ``vector_in`` tensors.
        done_expert / done_policy: float32 placeholders of shape [None, 1].
        action_in_expert: placeholder the expert actions are fed into.
        expert_action: expert actions in the form compared against the policy
            (identity for continuous, concatenated one-hots for discrete).
        loss: scalar BC loss tensor.
        annealed_learning_rate: decayed learning-rate tensor, or the plain
            float when annealing is disabled.
        update_batch: op that minimizes ``loss`` with Adam.
    """

    def __init__(
        self,
        policy_model: LearningModel,
        learning_rate: float = 3e-4,
        anneal_steps: int = 0,
    ):
        """
        Tensorflow operations to perform Behavioral Cloning on a Policy model
        :param policy_model: The policy of the learning algorithm
        :param learning_rate: The initial learning rate for behavioral cloning
        :param anneal_steps: Number of steps over which to anneal BC training
            (0 or less disables annealing)
        """
        self.policy_model = policy_model
        # The expert observations are fed through the policy's own inputs.
        self.expert_visual_in = self.policy_model.visual_in
        self.obs_in_expert = self.policy_model.vector_in
        self.make_inputs()
        self.create_loss(learning_rate, anneal_steps)

    def make_inputs(self) -> None:
        """
        Creates the placeholders for the expert data used by the BC loss:
        the done flags and the expert actions.
        """
        self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

        if self.policy_model.brain.vector_action_space_type == "continuous":
            # Continuous actions are compared directly with the policy output.
            action_length = self.policy_model.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            # Discrete actions arrive as one integer index per branch and are
            # expanded to one-hot vectors, concatenated along the feature axis.
            action_length = len(self.policy_model.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(self.action_in_expert[:, i], branch_size)
                    for i, branch_size in enumerate(self.policy_model.act_size)
                ],
                axis=1,
            )

    def create_loss(self, learning_rate: float, anneal_steps: int) -> None:
        """
        Creates the loss and update nodes for the BC module
        :param learning_rate: The learning rate for the optimizer
        :param anneal_steps: Number of steps over which to anneal the
            learning_rate; when not positive the rate stays constant
        """
        if self.policy_model.brain.vector_action_space_type == "continuous":
            # Mean squared error between policy output and expert action.
            selected_action = self.policy_model.output
            self.loss = tf.reduce_mean(
                tf.squared_difference(selected_action, self.expert_action)
            )
        else:
            # Cross-entropy between the one-hot expert actions and the policy
            # distribution; 1e-7 guards against log(0).
            # NOTE(review): the softmax is taken over the whole concatenated
            # tensor rather than per action branch -- confirm this is intended
            # for multi-branch discrete action spaces.
            log_probs = self.policy_model.all_log_probs
            self.loss = tf.reduce_mean(
                -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action
            )

        if anneal_steps > 0:
            # power=1.0 with an end value of 0.0: linear decay driven by the
            # policy's global step.
            self.annealed_learning_rate = tf.train.polynomial_decay(
                learning_rate,
                self.policy_model.global_step,
                anneal_steps,
                0.0,
                power=1.0,
            )
        else:
            self.annealed_learning_rate = learning_rate

        optimizer = tf.train.AdamOptimizer(learning_rate=self.annealed_learning_rate)
        self.update_batch = optimizer.minimize(self.loss)