
check for numpy float64 (#2948)

/develop/tanhsquash
GitHub, 5 years ago
Current commit
652488d9
21 files changed, with 167 insertions and 80 deletions
  1. .circleci/config.yml (4 changes)
  2. gym-unity/gym_unity/tests/test_gym.py (4 changes)
  3. ml-agents-envs/mlagents/envs/brain.py (20 changes)
  4. ml-agents-envs/mlagents/envs/tests/test_brain.py (6 changes)
  5. ml-agents/mlagents/trainers/bc/policy.py (7 changes)
  6. ml-agents/mlagents/trainers/buffer.py (10 changes)
  7. ml-agents/mlagents/trainers/components/bc/module.py (5 changes)
  8. ml-agents/mlagents/trainers/components/reward_signals/__init__.py (7 changes)
  9. ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (2 changes)
  10. ml-agents/mlagents/trainers/demo_loader.py (8 changes)
  11. ml-agents/mlagents/trainers/ppo/trainer.py (8 changes)
  12. ml-agents/mlagents/trainers/rl_trainer.py (2 changes)
  13. ml-agents/mlagents/trainers/tests/__init__.py (49 changes)
  14. ml-agents/mlagents/trainers/tests/mock_brain.py (32 changes)
  15. ml-agents/mlagents/trainers/tests/test_bc.py (12 changes)
  16. ml-agents/mlagents/trainers/tests/test_policy.py (6 changes)
  17. ml-agents/mlagents/trainers/tests/test_ppo.py (33 changes)
  18. ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 changes)
  19. ml-agents/mlagents/trainers/tests/test_rl_trainer.py (8 changes)
  20. ml-agents/mlagents/trainers/tests/test_sac.py (18 changes)
  21. ml-agents/mlagents/trainers/tf_policy.py (4 changes)

.circleci/config.yml (4 changes)


executor: << parameters.executor >>
working_directory: ~/repo
+ # Run additional numpy checks on unit tests
+ environment:
+   TEST_ENFORCE_NUMPY_FLOAT32: 1
steps:
- checkout
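The same check can be turned on for a local run. A minimal sketch in Python (the pytest invocation and test path are illustrative, not part of the change):

import os
import subprocess

# TEST_ENFORCE_NUMPY_FLOAT32 is read by ml-agents/mlagents/trainers/tests/__init__.py
# (shown further down) and enables the float64 check during the test session.
env = dict(os.environ, TEST_ENFORCE_NUMPY_FLOAT32="1")
subprocess.run(["pytest", "ml-agents/mlagents/trainers/tests"], env=env, check=True)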

gym-unity/gym_unity/tests/test_gym.py (4 changes)


mock_braininfo = mock.Mock()
mock_braininfo.return_value.vector_observations = np.array([num_agents * [1, 2, 3]])
if number_visual_observations:
- mock_braininfo.return_value.visual_observations = [[np.zeros(shape=(8, 8, 3))]]
+ mock_braininfo.return_value.visual_observations = [
+     [np.zeros(shape=(8, 8, 3), dtype=np.float32)]
+ ]
mock_braininfo.return_value.rewards = num_agents * [1.0]
mock_braininfo.return_value.local_done = num_agents * [False]
mock_braininfo.return_value.agents = range(0, num_agents)

ml-agents-envs/mlagents/envs/brain.py (20 changes)


@staticmethod
def merge_memories(m1, m2, agents1, agents2):
if len(m1) == 0 and len(m2) != 0:
- m1 = np.zeros((len(agents1), m2.shape[1]))
+ m1 = np.zeros((len(agents1), m2.shape[1]), dtype=np.float32)
- m2 = np.zeros((len(agents2), m1.shape[1]))
+ m2 = np.zeros((len(agents2), m1.shape[1]), dtype=np.float32)
- new_m1 = np.zeros((m1.shape[0], m2.shape[1]))
+ new_m1 = np.zeros((m1.shape[0], m2.shape[1]), dtype=np.float32)
- new_m2 = np.zeros((m2.shape[0], m1.shape[1]))
+ new_m2 = np.zeros((m2.shape[0], m1.shape[1]), dtype=np.float32)
new_m2[0 : m2.shape[0], 0 : m2.shape[1]] = m2
return np.append(m1, new_m2, axis=0)
return np.append(m1, m2, axis=0)

vis_obs = BrainInfo._process_visual_observations(brain_params, agent_info_list)
total_num_actions = sum(brain_params.vector_action_space_size)
- mask_actions = np.ones((len(agent_info_list), total_num_actions))
+ mask_actions = np.ones(
+     (len(agent_info_list), total_num_actions), dtype=np.float32
+ )
for agent_index, agent_info in enumerate(agent_info_list):
if agent_info.action_mask is not None:
if len(agent_info.action_mask) == total_num_actions:

brain_params: BrainParameters, agent_info_list: List[AgentInfoProto]
) -> np.ndarray:
if len(agent_info_list) == 0:
- vector_obs = np.zeros((0, brain_params.vector_observation_space_size))
+ vector_obs = np.zeros(
+     (0, brain_params.vector_observation_space_size), dtype=np.float32
+ )
else:
stacked_obs = []
has_nan = False

for vo in vec_obs:
# TODO consider itertools.chain here
proto_vector_obs.extend(vo.float_data.data)
- np_obs = np.array(proto_vector_obs)
+ np_obs = np.array(proto_vector_obs, dtype=np.float32)
# Check for NaNs or infs in the observations
# If there's a NaN in the observations, the dot() result will be NaN

has_nan = has_nan or np.isnan(d)
has_inf = has_inf or not np.isfinite(d)
stacked_obs.append(np_obs)
- vector_obs = np.array(stacked_obs)
+ vector_obs = np.array(stacked_obs, dtype=np.float32)
# If we have any NaN or Infs, use np.nan_to_num to replace these with finite values
if has_nan or has_inf:
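For context on the np.nan_to_num path referenced above: it preserves the array's dtype, which is why the test below expects np.finfo(np.float32).max rather than sys.float_info.max. A small illustration (not part of the diff):

import numpy as np

# nan_to_num replaces NaN with 0.0 and +inf with the largest finite value
# representable by the array's dtype (float32 here).
obs = np.array([1.0, np.nan, np.inf], dtype=np.float32)
cleaned = np.nan_to_num(obs)
assert cleaned[1] == 0.0
assert cleaned[2] == np.finfo(np.float32).max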

ml-agents-envs/mlagents/envs/tests/test_brain.py (6 changes)


from typing import List
import logging
import numpy as np
import sys
from unittest import mock
from mlagents.envs.communicator_objects.agent_info_pb2 import AgentInfoProto

agent_info_proto = _make_agent_info_proto([1.0, float("inf"), 0.0])
brain_info = BrainInfo.from_agent_proto(1, [agent_info_proto], test_brain)
- # inf should get set to float_max
- expected = [1.0, sys.float_info.max, 0.0]
+ # inf should get set to float32_max
+ float32_max = np.finfo(np.float32).max
+ expected = [1.0, float32_max, 0.0]
assert (brain_info.vector_observations == expected).all()
mock_nan_to_num.assert_called()
# We don't warn on inf, just NaN

ml-agents/mlagents/trainers/bc/policy.py (7 changes)


else:
feed_dict[self.model.true_action] = mini_batch["actions"]
feed_dict[self.model.action_masks] = np.ones(
- (num_sequences, sum(self.brain.vector_action_space_size))
+ (num_sequences, sum(self.brain.vector_action_space_size)),
+ dtype=np.float32,
)
if self.use_vec_obs:
feed_dict[self.model.vector_in] = mini_batch["vector_obs"]

if self.use_recurrent:
- feed_dict[self.model.memory_in] = np.zeros([num_sequences, self.m_size])
+ feed_dict[self.model.memory_in] = np.zeros(
+     [num_sequences, self.m_size], dtype=np.float32
+ )
run_out = self._execute_model(feed_dict, self.update_dict)
return run_out

ml-agents/mlagents/trainers/buffer.py (10 changes)


Sets the list of np.array to the input data
:param data: The np.array list to be set.
"""
+ # Make sure we convert incoming data to float32 if it's a float
+ dtype = None
+ if data is not None and len(data) and isinstance(data[0], float):
+     dtype = np.float32
- self[:] = list(np.array(data))
+ self[:] = list(np.array(data, dtype=dtype))
def get_batch(self, batch_size=None, training_length=1, sequential=True):
"""

" too large given the current number of data points."
)
if batch_size * training_length > len(self):
- padding = np.array(self[-1]) * self.padding_value
+ padding = (
+     np.array(self[-1], dtype=np.float32) * self.padding_value
+ )
return np.array(
[padding] * (training_length - leftover) + self[:],
dtype=np.float32,
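The dtype selection above only inspects the first element of the incoming list. A standalone sketch of the same idea (the helper name is hypothetical, not part of the change):

import numpy as np

def _to_float32_if_float(data):
    # Mirror the buffer logic: lists of Python floats would otherwise become float64.
    dtype = None
    if data is not None and len(data) and isinstance(data[0], float):
        dtype = np.float32
    return np.array(data, dtype=dtype)

assert _to_float32_if_float([1.0, 2.0]).dtype == np.float32
assert _to_float32_if_float([1, 2]).dtype != np.float64  # ints keep an integer dtype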

ml-agents/mlagents/trainers/components/bc/module.py (5 changes)


(
self.n_sequences * self.policy.sequence_length,
sum(self.policy.model.brain.vector_action_space_size),
-     )
+     ),
+     dtype=np.float32,
)
if self.policy.model.brain.vector_observation_space_size > 0:
feed_dict[self.policy.model.vector_in] = mini_batch_demo["vector_obs"]

]
if self.use_recurrent:
feed_dict[self.policy.model.memory_in] = np.zeros(
- [self.n_sequences, self.policy.m_size]
+ [self.n_sequences, self.policy.m_size], dtype=np.float32
)
if not self.policy.model.brain.vector_action_space_type == "continuous":
feed_dict[self.policy.model.prev_action] = mini_batch_demo[

ml-agents/mlagents/trainers/components/reward_signals/__init__.py (7 changes)


:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
"""
return RewardSignalResult(
- self.strength * np.zeros(len(current_info.agents)),
- np.zeros(len(current_info.agents)),
+ self.strength * np.zeros(len(current_info.agents), dtype=np.float32),
+ np.zeros(len(current_info.agents), dtype=np.float32),
)
def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:

"""
mini_batch_len = len(next(iter(mini_batch.values())))
return RewardSignalResult(
- self.strength * np.zeros(mini_batch_len), np.zeros(mini_batch_len)
+ self.strength * np.zeros(mini_batch_len, dtype=np.float32),
+ np.zeros(mini_batch_len, dtype=np.float32),
)
def prepare_update(

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py (2 changes)


:param next_info: The BrainInfo from the next timestep.
:return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
"""
- unscaled_reward = np.array(next_info.rewards)
+ unscaled_reward = np.array(next_info.rewards, dtype=np.float32)
scaled_reward = self.strength * unscaled_reward
return RewardSignalResult(scaled_reward, unscaled_reward)

ml-agents/mlagents/trainers/demo_loader.py (8 changes)


next_brain_info = BrainInfo.from_agent_proto(
0, [next_pair_info.agent_info], brain_params
)
- previous_action = np.array(pair_infos[idx].action_info.vector_actions) * 0
+ previous_action = (
+     np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
+ )
- previous_action = np.array(pair_infos[idx - 1].action_info.vector_actions)
+ previous_action = np.array(
+     pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
+ )
demo_buffer[0].last_brain_info = current_brain_info
demo_buffer[0]["done"].append(next_brain_info.local_done[0])
demo_buffer[0]["rewards"].append(next_brain_info.rewards[0])

ml-agents/mlagents/trainers/ppo/trainer.py (8 changes)


tmp_advantages.append(local_advantage)
tmp_returns.append(local_return)
- global_advantages = list(np.mean(np.array(tmp_advantages), axis=0))
- global_returns = list(np.mean(np.array(tmp_returns), axis=0))
+ global_advantages = list(
+     np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
+ )
+ global_returns = list(
+     np.mean(np.array(tmp_returns, dtype=np.float32), axis=0)
+ )
self.training_buffer[agent_id]["advantages"].set(global_advantages)
self.training_buffer[agent_id]["discounted_returns"].set(global_returns)

ml-agents/mlagents/trainers/rl_trainer.py (2 changes)


curr_to_use, take_action_outputs["action"], next_info
)
# Store the environment reward
- tmp_environment = np.array(next_info.rewards)
+ tmp_environment = np.array(next_info.rewards, dtype=np.float32)
rewards_out = AllRewardsOutput(
reward_signals=tmp_reward_signal_outs, environment=tmp_environment

ml-agents/mlagents/trainers/tests/__init__.py (49 changes)


import os

# Opt-in checking mode to ensure that we always create numpy arrays using float32
if os.getenv("TEST_ENFORCE_NUMPY_FLOAT32"):
    # This file is imported by pytest multiple times, but this breaks the patching.
    # Removing the env variable seems the easiest way to prevent this.
    del os.environ["TEST_ENFORCE_NUMPY_FLOAT32"]

    import numpy as np
    import traceback

    __old_np_array = np.array
    __old_np_zeros = np.zeros
    __old_np_ones = np.ones

    def _check_no_float64(arr, kwargs_dtype):
        if arr.dtype == np.float64:
            tb = traceback.extract_stack()
            # tb[-1] in the stack is this function.
            # tb[-2] is the wrapper function, e.g. np_array_no_float64
            # we want the calling function, so use tb[-3]
            filename = tb[-3].filename
            # Only raise if this came from mlagents code, not tensorflow
            if (
                "ml-agents/mlagents" in filename
                or "ml-agents-envs/mlagents" in filename
            ) and "tensorflow_to_barracuda.py" not in filename:
                raise ValueError(
                    f"float64 array created. Set dtype=np.float32 instead of current dtype={kwargs_dtype}. "
                    f"Run pytest with TEST_ENFORCE_NUMPY_FLOAT32=1 to confirm fix."
                )

    def np_array_no_float64(*args, **kwargs):
        res = __old_np_array(*args, **kwargs)
        _check_no_float64(res, kwargs.get("dtype"))
        return res

    def np_zeros_no_float64(*args, **kwargs):
        res = __old_np_zeros(*args, **kwargs)
        _check_no_float64(res, kwargs.get("dtype"))
        return res

    def np_ones_no_float64(*args, **kwargs):
        res = __old_np_ones(*args, **kwargs)
        _check_no_float64(res, kwargs.get("dtype"))
        return res

    np.array = np_array_no_float64
    np.zeros = np_zeros_no_float64
    np.ones = np_ones_no_float64
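The wrappers above catch the common case where omitting dtype silently yields float64. For reference, a short illustration of numpy's defaults (plain numpy behavior, independent of the patching above):

import numpy as np

# Python float literals and the default np.zeros/np.ones dtype are float64 ...
assert np.array([1.0, 2.0]).dtype == np.float64
assert np.zeros((2, 2)).dtype == np.float64
assert np.ones(3).dtype == np.float64
# ... so the enforced convention is to pass dtype=np.float32 explicitly.
assert np.array([1.0, 2.0], dtype=np.float32).dtype == np.float32
assert np.zeros((2, 2), dtype=np.float32).dtype == np.float32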

ml-agents/mlagents/trainers/tests/mock_brain.py (32 changes)


mock_braininfo = mock.Mock()
mock_braininfo.return_value.visual_observations = num_vis_observations * [
- np.ones((num_agents, 84, 84, 3))
+ np.ones((num_agents, 84, 84, 3), dtype=np.float32)
- num_agents * [num_vector_observations * [1]]
+ num_agents * [num_vector_observations * [1]], dtype=np.float32
- num_agents * [num_discrete_branches * [0.5]]
+ num_agents * [num_discrete_branches * [0.5]], dtype=np.float32
- num_agents * [num_vector_acts * [1.0]]
+ num_agents * [num_vector_acts * [1.0]], dtype=np.float32
- num_agents * [num_vector_acts * [0.5]]
+ num_agents * [num_vector_acts * [0.5]], dtype=np.float32
- mock_braininfo.return_value.memories = np.ones((num_agents, 8))
+ mock_braininfo.return_value.memories = np.ones((num_agents, 8), dtype=np.float32)
mock_braininfo.return_value.rewards = num_agents * [1.0]
mock_braininfo.return_value.local_done = num_agents * [False]
mock_braininfo.return_value.max_reached = num_agents * [100]

fake_action_size = len(brain_params.vector_action_space_size)
if brain_params.vector_action_space_type == "continuous":
fake_action_size = brain_params.vector_action_space_size[0]
buffer[0]["actions"].append(np.zeros(fake_action_size))
buffer[0]["prev_action"].append(np.zeros(fake_action_size))
buffer[0]["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
buffer[0]["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
np.ones(sum(brain_params.vector_action_space_size))
np.ones(sum(brain_params.vector_action_space_size), dtype=np.float32)
buffer[0]["action_probs"].append(np.ones(buffer[0]["actions"][0].shape))
buffer[0]["actions_pre"].append(np.ones(buffer[0]["actions"][0].shape))
buffer[0]["action_probs"].append(
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
)
buffer[0]["actions_pre"].append(
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
)
np.ones(buffer[0]["actions"][0].shape)
np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
np.ones(np.sum(brain_params.vector_action_space_size))
np.ones(np.sum(brain_params.vector_action_space_size), dtype=np.float32)
buffer[0]["memory"].append(np.ones(memory_size))
buffer[0]["memory"].append(np.ones(memory_size, dtype=np.float32))
buffer.append_update_buffer(0, batch_size=None, training_length=sequence_length)
return buffer

ml-agents/mlagents/trainers/tests/test_bc.py (12 changes)


model.dropout_rate: 1.0,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.action_masks: np.ones([2, 2]),
+ model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.dropout_rate: 1.0,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.visual_in[0]: np.ones([2, 40, 30, 3]),
- model.visual_in[1]: np.ones([2, 40, 30, 3]),
- model.action_masks: np.ones([2, 2]),
+ model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.visual_in[0]: np.ones([2, 40, 30, 3]),
- model.visual_in[1]: np.ones([2, 40, 30, 3]),
+ model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

ml-agents/mlagents/trainers/tests/test_policy.py (6 changes)


test_seed = 3
policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
policy_eval_out = {
"action": np.array([1.0]),
"memory_out": np.array([[2.5]]),
"value": np.array([1.1]),
"action": np.array([1.0], dtype=np.float32),
"memory_out": np.array([[2.5]], dtype=np.float32),
"value": np.array([1.1], dtype=np.float32),
}
policy.evaluate = MagicMock(return_value=policy_eval_out)
brain_info_with_agents = BrainInfo(

ml-agents/mlagents/trainers/tests/test_ppo.py (33 changes)


model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.visual_in[0]: np.ones([2, 40, 30, 3]),
- model.visual_in[1]: np.ones([2, 40, 30, 3]),
- model.epsilon: np.array([[0, 1], [2, 3]]),
+ model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.epsilon: np.array([[0, 1], [2, 3]], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.visual_in[0]: np.ones([2, 40, 30, 3]),
- model.visual_in[1]: np.ones([2, 40, 30, 3]),
- model.action_masks: np.ones([2, 2]),
+ model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.action_masks: np.ones([2, 2]),
+ model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 1,
model.sequence_length: 2,
model.prev_action: [[0], [0]],
- model.memory_in: np.zeros((1, memory_size)),
+ model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
- model.action_masks: np.ones([1, 2]),
+ model.action_masks: np.ones([1, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

feed_dict = {
model.batch_size: 1,
model.sequence_length: 2,
- model.memory_in: np.zeros((1, memory_size)),
+ model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
model.epsilon: np.array([[0, 1]]),
}

def test_rl_functions():
- rewards = np.array([0.0, 0.0, 0.0, 1.0])
+ rewards = np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32)
- np.testing.assert_array_almost_equal(returns, np.array([0.729, 0.81, 0.9, 1.0]))
+ np.testing.assert_array_almost_equal(
+     returns, np.array([0.729, 0.81, 0.9, 1.0], dtype=np.float32)
+ )
def test_trainer_increment_step(dummy_config):

rewardsout = AllRewardsOutput(
reward_signals={
"extrinsic": RewardSignalResult(
- scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])
+ scaled_reward=np.array([1.0, 1.0], dtype=np.float32),
+ unscaled_reward=np.array([1.0, 1.0], dtype=np.float32),
- environment=np.array([1.0, 1.0]),
+ environment=np.array([1.0, 1.0], dtype=np.float32),
- values = {"extrinsic": np.array([[2.0]])}
+ values = {"extrinsic": np.array([[2.0]], dtype=np.float32)}
agent_id = "123"
idx = 0
# make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.

ml-agents/mlagents/trainers/tests/test_reward_signals.py (2 changes)


brain_info = brain_infos[env.external_brain_names[0]]
next_brain_info = env.step()[env.external_brain_names[0]]
# Test evaluate
- action = np.ones((len(brain_info.agents), policy.num_branches))
+ action = np.ones((len(brain_info.agents), policy.num_branches), dtype=np.float32)
rsig_result = policy.reward_signals[reward_signal_name].evaluate(
brain_info, action, next_brain_info
)

ml-agents/mlagents/trainers/tests/test_rl_trainer.py (8 changes)


def create_mock_policy():
mock_policy = mock.Mock()
mock_policy.reward_signals = {}
- mock_policy.retrieve_memories.return_value = np.zeros((1, 1))
- mock_policy.retrieve_previous_action.return_value = np.zeros((1, 1))
+ mock_policy.retrieve_memories.return_value = np.zeros((1, 1), dtype=np.float32)
+ mock_policy.retrieve_previous_action.return_value = np.zeros(
+     (1, 1), dtype=np.float32
+ )
return mock_policy

fake_action_outputs = {
"action": [0.1, 0.1],
"value_heads": {},
"entropy": np.array([1.0]),
"entropy": np.array([1.0], dtype=np.float32),
"learning_rate": 1.0,
}
mock_braininfo = mb.create_mock_braininfo(

ml-agents/mlagents/trainers/tests/test_sac.py (18 changes)


model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.visual_in[0]: np.ones([2, 40, 30, 3]),
- model.visual_in[1]: np.ones([2, 40, 30, 3]),
+ model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.visual_in[0]: np.ones([2, 40, 30, 3]),
- model.visual_in[1]: np.ones([2, 40, 30, 3]),
- model.action_masks: np.ones([2, 2]),
+ model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
+ model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 2,
model.sequence_length: 1,
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
- model.action_masks: np.ones([2, 2]),
+ model.action_masks: np.ones([2, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

model.batch_size: 1,
model.sequence_length: 2,
model.prev_action: [[0], [0]],
- model.memory_in: np.zeros((1, memory_size)),
+ model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
- model.action_masks: np.ones([1, 2]),
+ model.action_masks: np.ones([1, 2], dtype=np.float32),
}
sess.run(run_list, feed_dict=feed_dict)

feed_dict = {
model.batch_size: 1,
model.sequence_length: 2,
- model.memory_in: np.zeros((1, memory_size)),
+ model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
}
sess.run(run_list, feed_dict=feed_dict)

ml-agents/mlagents/trainers/tf_policy.py (4 changes)


:param num_agents: Number of agents.
:return: Numpy array of zeros.
"""
- return np.zeros((num_agents, self.m_size), dtype=np.float)
+ return np.zeros((num_agents, self.m_size), dtype=np.float32)
def save_memories(
self, agent_ids: List[int], memory_matrix: Optional[np.ndarray]

self.memory_dict[agent_id] = memory_matrix[index, :]
def retrieve_memories(self, agent_ids: List[int]) -> np.ndarray:
- memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float)
+ memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32)
for index, agent_id in enumerate(agent_ids):
if agent_id in self.memory_dict:
memory_matrix[index, :] = self.memory_dict[agent_id]
