// Handles the outcome where agent A wins: sets a positive reward on m_AgentA,
// a negative reward on m_AgentB, increments m_AgentA.score and calls Reset().
// NOTE(review): no closing brace for this function is visible in this fragment.
void AgentAWins()
{
// NOTE(review): SetReward is called twice in a row on each agent with
// different magnitudes (.1f / -.1f, then 1f / -1f). If SetReward overwrites
// rather than accumulates, only the second pair has any effect -- confirm
// which magnitude is intended and drop the other pair.
m_AgentA.SetReward(.1f);
m_AgentB.SetReward(-.1f);// - m_AgentB.timePenalty);
m_AgentA.SetReward(1f);
m_AgentB.SetReward(-1f);// - m_AgentB.timePenalty);
m_AgentA.score += 1;
Reset();
// NOTE(review): the statements below mirror the ones above with the signs
// flipped and credit m_AgentB.score instead. They presumably belong to a
// separate "agent B wins" handler whose header is not visible here; as
// written they would run after Reset() inside AgentAWins -- confirm.
m_AgentA.SetReward(-.1f);// - m_AgentA.timePenalty);
m_AgentB.SetReward(.1f);
m_AgentA.SetReward(-1f);// - m_AgentA.timePenalty);
m_AgentB.SetReward(1f);
m_AgentB.score += 1;
# NOTE(review): presumably training hyperparameters; the enclosing keys and
# the original nesting are not visible in this fragment -- confirm.
batch_size: 2048
buffer_size: 20480
hidden_units: 256
# NOTE(review): `beta` is listed twice with different values. If both entries
# sit in the same mapping this is a duplicate key, which parsers either
# reject or resolve by keeping a single value -- confirm which value is
# intended and remove the other.
beta: 1.0e-2
beta: 2.0e-2
time_horizon: 1000
# NOTE(review): `window` presumably belongs under `self_play`, but it carries
# no indentation here, so as written `self_play` is empty and `window` is a
# sibling key -- confirm the intended nesting.
self_play:
window: 10