import tensorflow as tf
import numpy as np
from mlagents.trainers.models import LearningModel


class BCModel(object):
    def __init__(
        self,
        policy_model: LearningModel,
        learning_rate: float = 3e-4,
        anneal_steps: int = 0,
    ):
        """
        Tensorflow operations to perform Behavioral Cloning on a Policy model.
        :param policy_model: The policy of the learning algorithm
        :param learning_rate: The initial learning rate for behavioral cloning
        :param anneal_steps: Number of steps over which to anneal BC training
        """
        self.policy_model = policy_model
        # Expert observations are fed through the same placeholders the policy
        # itself uses for visual and vector observations.
        self.expert_visual_in = self.policy_model.visual_in
        self.obs_in_expert = self.policy_model.vector_in
        self.make_inputs()
        self.create_loss(learning_rate, anneal_steps)

    def make_inputs(self) -> None:
        """
        Creates the input placeholders for the BC module.
        """
        self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

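        # Expert actions are represented to match the policy's own action
        # representation: continuous actions are passed through as float vectors,
        # while discrete branch indices are expanded into concatenated one-hot
        # vectors below.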
        if self.policy_model.brain.vector_action_space_type == "continuous":
            action_length = self.policy_model.act_size[0]
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.float32
            )
            self.expert_action = tf.identity(self.action_in_expert)
        else:
            action_length = len(self.policy_model.act_size)
            self.action_in_expert = tf.placeholder(
                shape=[None, action_length], dtype=tf.int32
            )
            self.expert_action = tf.concat(
                [
                    tf.one_hot(
                        self.action_in_expert[:, i], self.policy_model.act_size[i]
                    )
                    for i in range(len(self.policy_model.act_size))
                ],
                axis=1,
            )

    def create_loss(self, learning_rate: float, anneal_steps: int) -> None:
        """
        Creates the loss and update nodes for the BC module.
        :param learning_rate: The learning rate for the optimizer
        :param anneal_steps: Number of steps over which to anneal the learning_rate
        """
        selected_action = self.policy_model.output
        action_size = self.policy_model.act_size
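        # Continuous control uses a mean squared error between the policy's output
        # and the expert action; discrete control uses a cross-entropy between the
        # policy's action distribution and the expert's one-hot actions.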
        if self.policy_model.brain.vector_action_space_type == "continuous":
            self.loss = tf.reduce_mean(
                tf.squared_difference(selected_action, self.expert_action)
            )
        else:
            log_probs = self.policy_model.all_log_probs
            action_idx = [0] + list(np.cumsum(action_size))
            # Per-branch entropy of the policy's action distribution. It is computed
            # here but not folded into self.loss below.
            entropy = tf.reduce_sum(
                (
                    tf.stack(
                        [
                            tf.nn.softmax_cross_entropy_with_logits_v2(
                                labels=tf.nn.softmax(
                                    log_probs[:, action_idx[i] : action_idx[i + 1]]
                                ),
                                logits=log_probs[:, action_idx[i] : action_idx[i + 1]],
                            )
                            for i in range(len(action_size))
                        ],
                        axis=1,
                    )
                ),
                axis=1,
            )
            self.loss = tf.reduce_mean(
                -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action
            )

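        # Optionally decay the learning rate linearly to zero over anneal_steps
        # (polynomial decay with power=1.0) so the BC updates shrink over time.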
        if anneal_steps > 0:
            self.annealed_learning_rate = tf.train.polynomial_decay(
                learning_rate,
                self.policy_model.global_step,
                anneal_steps,
                0.0,
                power=1.0,
            )
        else:
            self.annealed_learning_rate = learning_rate

        optimizer = tf.train.AdamOptimizer(learning_rate=self.annealed_learning_rate)
        self.update_batch = optimizer.minimize(self.loss)
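

# Usage sketch (illustrative only; the session `sess` and the expert batch arrays
# below are assumptions, not part of this module). Given a constructed
# LearningModel `policy_model`, a behavioral cloning update could look roughly like:
#
#   bc_model = BCModel(policy_model, learning_rate=3e-4, anneal_steps=10000)
#   feed_dict = {
#       policy_model.vector_in: expert_obs,          # expert observations
#       bc_model.action_in_expert: expert_actions,   # expert actions
#   }
#   _, loss = sess.run([bc_model.update_batch, bc_model.loss], feed_dict)
#
# Depending on the policy's configuration, additional placeholders (e.g. visual
# observations or discrete action masks) may also need to be fed.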