|
|
|
|
|
|
|
|
|
from typing import Optional, Any, Dict, cast, List, Tuple |
|
|
|
import numpy as np |
|
|
|
import os |
|
|
|
import copy |
|
|
|
|
|
|
from mlagents.trainers.trajectory import SplitObservations |
|
|
|
from mlagents.trainers.policy.tf_policy import TFPolicy |
|
|
|
from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel |
|
|
|
from mlagents.trainers.policy.transfer_policy import TransferPolicy |
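# The symbols below (tf, ModelUtils, ScheduleType, EncoderType, AgentBuffer) are used
# later in this module but their imports are missing from this fragment. These paths
# follow the ml-agents TF trainer layout and are assumptions, not part of the original.
from mlagents.tf_utils import tf
from mlagents.trainers.models import ModelUtils, ScheduleType, EncoderType
from mlagents.trainers.buffer import AgentBuffer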
|
|
|
|
|
|
else: |
|
|
|
self._create_dc_critic(h_size, hyperparameters.value_layers, vis_encode_type) |
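# In addition to the online critic above, a separate "target_value" critic is built
# below; its weights only change through the soft-copy op created further down.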
|
|
|
|
|
|
|
with tf.variable_scope("target_value"): |
|
|
|
if policy.use_continuous_act: |
|
|
|
self._create_cc_critic_target(h_size, hyperparameters.value_layers, vis_encode_type) |
|
|
|
else: |
|
|
|
self._create_dc_critic_target(h_size, hyperparameters.value_layers, vis_encode_type) |
|
|
|
|
|
|
|
self._create_soft_critic_copy() |
|
|
|
|
|
|
|
with tf.variable_scope("optimizer/"): |
|
|
|
self.learning_rate = ModelUtils.create_schedule( |
|
|
|
self._schedule, |
|
|
|
|
|
|
min_value=1e-10, |
|
|
|
) |
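# Separate schedules: the PPO learning rate above follows the configured schedule,
# while the dynamics-model (and, presumably, bisimulation) updates below use their own rates.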
|
|
|
self.model_learning_rate = ModelUtils.create_schedule(
    # ScheduleType.LINEAR,
    # self._schedule,
    ScheduleType.CONSTANT,
    lr,
    self.policy.global_step,
    int(max_step),
    min_value=1e-10,
)
|
|
|
|
|
|
# Assumed: the assignment target for this second, smaller constant schedule is missing
# from the fragment; it is used for the bisimulation-encoder updates below.
self.bisim_learning_rate = ModelUtils.create_schedule(
    ScheduleType.CONSTANT,
    lr / 10,
    self.policy.global_step,
    int(max_step),
    min_value=1e-10,
)
|
|
|
|
|
|
self.old_log_probs, |
|
|
|
self.value_heads, |
|
|
|
self.policy.entropy, |
|
|
|
self.policy.targ_encoder,
# self.policy.next_encoder,
|
|
|
self.policy.predict, |
|
|
|
beta, |
|
|
|
epsilon, |
|
|
|
|
|
|
if self.use_transfer: |
|
|
|
self.policy.load_graph_partial(
    self.transfer_path,
    self.transfer_type,
    hyperparameters.load_model,
    hyperparameters.load_policy,
    hyperparameters.load_value,
)
|
|
|
self.run_soft_critic_copy() |
|
|
|
self.policy.get_encoder_weights() |
|
|
|
self.policy.get_policy_weights() |
|
|
|
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
def _create_losses( |
|
|
|
self, probs, old_probs, value_heads, entropy, targ_encoder, predict,
|
|
|
beta, epsilon, lr, max_step |
|
|
|
): |
|
|
|
""" |
|
|
|
|
|
|
# For cleaner stats reporting |
|
|
|
self.abs_policy_loss = tf.abs(self.policy_loss) |
|
|
|
|
|
|
|
# encoder and predict loss |
|
|
|
# self.dis_returns = tf.placeholder( |
|
|
|
# shape=[None], dtype=tf.float32, name="dis_returns" |
|
|
|
# ) |
|
|
|
# target = tf.concat([targ_encoder, tf.expand_dims(self.dis_returns, -1)], axis=1) |
|
|
|
# if self.predict_return: |
|
|
|
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, target)) |
|
|
|
# else: |
|
|
|
# self.model_loss = tf.reduce_mean(tf.squared_difference(predict, targ_encoder)) |
|
|
|
# if self.with_prior: |
|
|
|
# if self.use_var_encoder: |
|
|
|
# self.model_loss += encoder_distribution.kl_standard() |
|
|
|
# if self.use_var_predict: |
|
|
|
# self.model_loss += self.policy.predict_distribution.kl_standard() |
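# The live model loss below reuses the forward-prediction loss computed inside the
# policy graph, optionally adding half of the reward-prediction loss.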
|
|
|
|
|
|
|
self.model_loss = self.policy.forward_loss |
|
|
|
if self.predict_return: |
|
|
|
self.model_loss += 0.5 * self.policy.reward_loss |
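# Bisimulation objective: the distance between the current and "bisim" encodings is
# regressed onto the discounted difference in predicted next encodings plus the
# absolute difference in predicted rewards (see self.bisim_loss below).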
|
|
|
|
|
|
reward_diff = tf.reduce_mean( |
|
|
|
tf.squared_difference(self.policy.bisim_pred_reward, self.policy.pred_reward) |
|
|
|
) |
|
|
|
predict_diff = self.reward_signals["extrinsic"].gamma * predict_diff + tf.abs(reward_diff)
# predict_diff = 0.99 * predict_diff + tf.abs(reward_diff)
encode_dist = tf.reduce_mean(
    # tf.squared_difference(self.policy.encoder, self.policy.bisim_encoder)
    tf.abs(self.policy.encoder - self.policy.bisim_encoder)
)
|
|
|
self.encode_dist_val = encode_dist |
|
|
|
self.predict_diff_val = predict_diff |
|
|
|
self.bisim_loss = tf.squared_difference(encode_dist, predict_diff) |
|
|
|
|
|
|
|
self.loss = ( |
|
|
|
|
|
|
update_vals = self._execute_model(feed_dict, self.update_dict) |
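# update_dict presumably bundles the PPO and model losses into a single optimizer step;
# the model-only path further down runs the same feed against model_only_update_dict.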
|
|
|
|
|
|
|
# update target encoder |
|
|
|
if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
# if self.num_updates % self.copy_every == 0:
|
|
|
self.run_soft_critic_copy() |
|
|
|
# print("copy") |
|
|
|
# self.policy.get_encoder_weights() |
|
|
|
|
|
|
|
|
|
|
update_vals = self._execute_model(feed_dict, self.model_only_update_dict) |
|
|
|
|
|
|
|
# update target encoder |
|
|
|
if not self.reuse_encoder and self.num_updates % self.copy_every == 0:
# if self.num_updates % self.copy_every == 0:
|
|
|
self.run_soft_critic_copy() |
|
|
|
# print("copy") |
|
|
|
# self.policy.get_encoder_weights() |
|
|
|
self.num_updates += 1 |
|
|
|
return update_stats |
|
|
|
|
|
|
|
def update_encoder(self, mini_batch1: AgentBuffer, mini_batch2: AgentBuffer): |
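"""
Updates the encoder using the bisimulation objective on a pair of mini-batches.
(Summary inferred from the bisim_update_dict usage below.)
"""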
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
update_vals = self._execute_model(feed_dict, self.bisim_update_dict) |
|
|
|
# print("model difference:", self.policy.sess.run(self.predict_diff_val, feed_dict=feed_dict)) |
|
|
|
# print("encoder distance:", self.policy.sess.run(self.encode_dist_val, feed_dict=feed_dict)) |
|
|
|
|
|
|
|
for stat_name, update_name in stats_needed.items(): |
|
|
|
if update_name in update_vals.keys(): |
|
|
|
|
|
|
keepdims=True, |
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
def _create_cc_critic_target( |
|
|
|
self, h_size: int, num_layers: int, vis_encode_type: EncoderType |
|
|
|
) -> None: |
|
|
|
""" |
|
|
|
Creates the target network for the continuous-control critic (value) network.
|
|
|
:param h_size: Size of hidden linear layers. |
|
|
|
:param num_layers: Number of hidden linear layers. |
|
|
|
:param vis_encode_type: The type of visual encoder to use. |
|
|
|
""" |
|
|
|
input_state = self.policy.target_encoder |
|
|
|
|
|
|
|
hidden_value = ModelUtils.create_vector_observation_encoder( |
|
|
|
input_state, |
|
|
|
h_size, |
|
|
|
ModelUtils.swish, |
|
|
|
num_layers, |
|
|
|
scope=f"main_graph", |
|
|
|
reuse=False |
|
|
|
) |
|
|
|
self.target_value_heads, self.target_value = ModelUtils.create_value_heads( |
|
|
|
self.stream_names, hidden_value |
|
|
|
) |
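# Gradients are stopped on the target heads below so they are only updated through
# the soft-copy op, never by the optimizer directly.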
|
|
|
|
|
|
|
for name in self.stream_names: |
|
|
|
self.target_value_heads[name] = tf.stop_gradient(self.target_value_heads[name]) |
|
|
|
|
|
|
|
self.target_all_old_log_probs = tf.placeholder( |
|
|
|
shape=[None, sum(self.policy.act_size)], |
|
|
|
dtype=tf.float32, |
|
|
|
name="old_probabilities", |
|
|
|
) |
|
|
|
|
|
|
|
self.target_old_log_probs = tf.reduce_sum( |
|
|
|
(tf.identity(self.target_all_old_log_probs)), axis=1, keepdims=True
|
|
|
) |
|
|
|
|
|
|
|
def _create_dc_critic_target( |
|
|
|
self, h_size: int, num_layers: int, vis_encode_type: EncoderType |
|
|
|
) -> None: |
|
|
|
""" |
|
|
|
Creates the target network for the discrete-control critic (value) network.
|
|
|
:param h_size: Size of hidden linear layers. |
|
|
|
:param num_layers: Number of hidden linear layers. |
|
|
|
:param vis_encode_type: The type of visual encoder to use. |
|
|
|
""" |
|
|
|
input_state = self.policy.target_encoder |
|
|
|
|
|
|
|
hidden_value = ModelUtils.create_vector_observation_encoder( |
|
|
|
input_state, |
|
|
|
h_size, |
|
|
|
ModelUtils.swish, |
|
|
|
num_layers, |
|
|
|
scope=f"main_graph", |
|
|
|
reuse=False |
|
|
|
) |
|
|
|
self.target_value_heads, self.target_value = ModelUtils.create_value_heads( |
|
|
|
self.stream_names, hidden_value |
|
|
|
) |
|
|
|
for name in self.stream_names: |
|
|
|
self.target_value_heads[name] = tf.stop_gradient(self.target_value_heads[name]) |
|
|
|
# self.target_value[name] = tf.stop_gradient(self.target_value[name]) |
|
|
|
|
|
|
|
self.target_all_old_log_probs = tf.placeholder( |
|
|
|
shape=[None, sum(self.policy.act_size)], |
|
|
|
dtype=tf.float32, |
|
|
|
name="old_probabilities", |
|
|
|
) |
|
|
|
|
|
|
|
# Break old log probs into separate branches |
|
|
|
old_log_prob_branches = ModelUtils.break_into_branches( |
|
|
|
self.target_all_old_log_probs, self.policy.act_size |
|
|
|
) |
|
|
|
|
|
|
|
_, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer( |
|
|
|
old_log_prob_branches, self.policy.action_masks, self.policy.act_size |
|
|
|
) |
|
|
|
|
|
|
|
action_idx = [0] + list(np.cumsum(self.policy.act_size)) |
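# action_idx holds the cumulative offsets of each discrete branch inside the
# concatenated action / logit tensors, so each branch can be sliced out below.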
|
|
|
|
|
|
|
self.target_old_log_probs = tf.reduce_sum( |
|
|
|
( |
|
|
|
tf.stack( |
|
|
|
[ |
|
|
|
-tf.nn.softmax_cross_entropy_with_logits_v2( |
|
|
|
labels=self.policy.selected_actions[ |
|
|
|
:, action_idx[i] : action_idx[i + 1] |
|
|
|
], |
|
|
|
logits=old_normalized_logits[ |
|
|
|
:, action_idx[i] : action_idx[i + 1] |
|
|
|
], |
|
|
|
) |
|
|
|
for i in range(len(self.policy.act_size)) |
|
|
|
], |
|
|
|
axis=1, |
|
|
|
) |
|
|
|
), |
|
|
|
axis=1, |
|
|
|
keepdims=True, |
|
|
|
) |
|
|
|
|
|
|
|
def get_trajectory_value_estimates( |
|
|
|
self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool |
|
|
|
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]: |
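"""
Evaluates the target value heads over a whole trajectory and, in a second pass,
bootstraps value estimates for the final observation.
"""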
|
|
|
feed_dict: Dict[tf.Tensor, Any] = { |
|
|
|
self.policy.batch_size_ph: batch.num_experiences, |
|
|
|
self.policy.sequence_length_ph: batch.num_experiences, # We want to feed data in batch-wise, not time-wise. |
|
|
|
} |
|
|
|
|
|
|
|
if self.policy.vec_obs_size > 0: |
|
|
|
feed_dict[self.policy.vector_in] = batch["vector_obs"] |
|
|
|
if self.policy.vis_obs_size > 0: |
|
|
|
for i in range(len(self.policy.visual_in)): |
|
|
|
_obs = batch["visual_obs%d" % i] |
|
|
|
feed_dict[self.policy.visual_in[i]] = _obs |
|
|
|
if self.policy.use_recurrent: |
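# Zero-initialize both the policy memory and the value-network memory for this pass.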
|
|
|
feed_dict[self.policy.memory_in] = [ |
|
|
|
np.zeros((self.policy.m_size), dtype=np.float32) |
|
|
|
] |
|
|
|
feed_dict[self.memory_in] = [np.zeros((self.m_size), dtype=np.float32)] |
|
|
|
if self.policy.prev_action is not None: |
|
|
|
feed_dict[self.policy.prev_action] = batch["prev_action"] |
|
|
|
|
|
|
|
if self.policy.use_recurrent: |
|
|
|
value_estimates, policy_mem, value_mem = self.sess.run( |
|
|
|
[self.target_value_heads, self.policy.memory_out, self.memory_out], feed_dict |
|
|
|
) |
|
|
|
prev_action = ( |
|
|
|
batch["actions"][-1] if not self.policy.use_continuous_act else None |
|
|
|
) |
|
|
|
else: |
|
|
|
value_estimates = self.sess.run(self.target_value_heads, feed_dict) |
|
|
|
prev_action = None |
|
|
|
policy_mem = None |
|
|
|
value_mem = None |
|
|
|
value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()} |
|
|
|
|
|
|
|
# We do this in a separate step to feed the memory outs - a further optimization would |
|
|
|
# be to append to the obs before running sess.run. |
|
|
|
final_value_estimates = self._get_value_estimates( |
|
|
|
next_obs, done, policy_mem, value_mem, prev_action |
|
|
|
) |
|
|
|
|
|
|
|
return value_estimates, final_value_estimates |
|
|
|
|
|
|
|
def _create_soft_critic_copy(self): |
|
|
|
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_value') |
|
|
|
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='value') |
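# Despite the "hard_replacement" scope name, this is a soft (Polyak) update: each run
# moves the target critic 10% of the way toward the online critic.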
|
|
|
|
|
|
|
with tf.variable_scope('hard_replacement'): |
|
|
|
self.target_replace_op = [tf.assign(t, 0.9*t + 0.1*e) for t, e in zip(t_params, e_params)] |
|
|
|
|
|
|
|
def run_soft_critic_copy(self): |
|
|
|
with self.policy.graph.as_default(): |
|
|
|
# print("before") |
|
|
|
# val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "value/extrinsic_value/bias:0") |
|
|
|
# print("value:", self.sess.run(val)) |
|
|
|
# target_val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_value/extrinsic_value/bias:0") |
|
|
|
# print("target_val:", self.sess.run(target_val)) |
|
|
|
self.policy.sess.run(self.target_replace_op) |
|
|
|
# print("copy") |
|
|
|
# val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "value/extrinsic_value/bias:0") |
|
|
|
# print("value:", self.sess.run(val)) |
|
|
|
# target_val = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target_value/extrinsic_value/bias:0") |
|
|
|
# print("target_val:", self.sess.run(target_val)) |