Use Barracuda tensors and Barracuda 0.2.4 (#2308)

Brings a batch of temporary-memory allocation optimizations: * switched to Barracuda-backed tensors across the board, which helps leverage allocators and reuse the internal buffers * added the Barracuda 0.2.4 release, which brings another set of temporary-memory allocation fixes
6 年前 · 9ea7fea8
--- a/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorApplier.cs
+++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorApplier.cs
 using NUnit.Framework;
 using UnityEngine;
 using System.Reflection;
+using Barracuda;
 using MLAgents.InferenceBrain;

 namespace MLAgents.Tests
        /// <summary>
        /// Smoke test: builds a TensorApplier from a default-constructed
        /// BrainParameters and asserts the result is not null. Because a C#
        /// constructor call never yields null, this effectively verifies that
        /// construction does not throw.
        /// </summary>
        public void Contruction()
        {
            var bp = new BrainParameters();
-            var tensorGenerator = new TensorApplier(bp, 0);
+            var tensorGenerator = new TensorApplier(bp, 0, new TensorCachingAllocator());
            // NOTE(review): the local is named tensorGenerator but holds a TensorApplier.
            Assert.IsNotNull(tensorGenerator);
        }

-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
-                Data = new float[,] {{1, 2, 3}, {4, 5, 6}}
+                Data = new Tensor (2, 3, new float[] {1, 2, 3, 
+                                                                4, 5, 6})
            };
            var agentInfos = GetFakeAgentInfos();
            
        [Test]
        public void ApplyDiscreteActionOutput()
        {
-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
-                Data = new float[,] {{0.5f, 22.5f, 0.1f, 5f, 1f},
-                    {4f, 5f, 6f, 7f, 8f}}
+                Data = new Tensor (2, 5, new[] {0.5f, 22.5f, 0.1f, 5f, 1f,
+                                                                4f, 5f, 6f, 7f, 8f})
-            var applier = new DiscreteActionOutputApplier(new int[]{2, 3}, 0);
+            var applier = new DiscreteActionOutputApplier(new int[]{2, 3}, 0, new TensorCachingAllocator());
            applier.Apply(inputTensor, agentInfos);
            var agents = agentInfos.Keys.ToList();
            var agent = agents[0] as TestAgent;
        [Test]
        public void ApplyMemoryOutput()
        {
-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
-                Data = new float[,] {{0.5f, 22.5f, 0.1f, 5f, 1f},
-                    {4f, 5f, 6f, 7f, 8f}}
+                Data = new Tensor (2, 5, new[] {0.5f, 22.5f, 0.1f, 5f, 1f,
+                                                          4f, 5f, 6f, 7f, 8f})
            };
            var agentInfos = GetFakeAgentInfos();
            
        [Test]
        public void ApplyValueEstimate()
        {
-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
-                Data = new float[,] {{0.5f}, {8f}}
+                Data = new Tensor (2, 1, new[]{0.5f, 8f})
            };
            var agentInfos = GetFakeAgentInfos();
            
--- a/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs
+++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/EditModeTestInternalBrainTensorGenerator.cs
 using System;
 using System.Collections.Generic;
 using System.Linq;
+using Barracuda;
 using NUnit.Framework;
 using UnityEngine;
 using MLAgents.InferenceBrain;
        /// <summary>
        /// Smoke test: builds a TensorGenerator from a default-constructed
        /// BrainParameters and asserts the result is not null. Because a C#
        /// constructor call never yields null, this effectively verifies that
        /// construction does not throw.
        /// </summary>
        public void Contruction()
        {
            var bp = new BrainParameters();
-            var tensorGenerator = new TensorGenerator(bp, 0);
+            var tensorGenerator = new TensorGenerator(bp, 0, new TensorCachingAllocator());
            Assert.IsNotNull(tensorGenerator);
        }

-            var inputTensor = new Tensor();
+            var inputTensor = new TensorProxy();
-            var generator = new BatchSizeGenerator();
+            var generator = new BatchSizeGenerator(new TensorCachingAllocator());
-            Assert.IsNotNull(inputTensor.Data as int[]);
-            Assert.AreEqual((inputTensor.Data as int[])[0], batchSize);
+            Assert.IsNotNull(inputTensor.Data);
+            Assert.AreEqual(inputTensor.Data[0], batchSize);
-            var inputTensor = new Tensor();
+            var inputTensor = new TensorProxy();
-            var generator = new SequenceLengthGenerator();
+            var generator = new SequenceLengthGenerator(new TensorCachingAllocator());
-            Assert.IsNotNull(inputTensor.Data as int[]);
-            Assert.AreEqual((inputTensor.Data as int[])[0], 1);
+            Assert.IsNotNull(inputTensor.Data);
+            Assert.AreEqual(inputTensor.Data[0], 1);
-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
            {
                Shape = new long[] {2, 3}
            };
-            var generator = new VectorObservationGenerator();
+            var generator = new VectorObservationGenerator(new TensorCachingAllocator());
-            Assert.IsNotNull(inputTensor.Data as float[,]);
-            Assert.AreEqual((inputTensor.Data as float[,])[0, 0], 1);
-            Assert.AreEqual((inputTensor.Data as float[,])[0, 2], 3);
-            Assert.AreEqual((inputTensor.Data as float[,])[1, 0], 4);
-            Assert.AreEqual((inputTensor.Data as float[,])[1, 2], 6);
+            Assert.IsNotNull(inputTensor.Data);
+            Assert.AreEqual(inputTensor.Data[0, 0], 1);
+            Assert.AreEqual(inputTensor.Data[0, 2], 3);
+            Assert.AreEqual(inputTensor.Data[1, 0], 4);
+            Assert.AreEqual(inputTensor.Data[1, 2], 6);
-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
            {
                Shape = new long[] {2, 5}
            };
-            var generator = new RecurrentInputGenerator();
+            var generator = new RecurrentInputGenerator(new TensorCachingAllocator());
-            Assert.IsNotNull(inputTensor.Data as float[,]);
-            Assert.AreEqual((inputTensor.Data as float[,])[0, 0], 0);
-            Assert.AreEqual((inputTensor.Data as float[,])[0, 4], 0);
-            Assert.AreEqual((inputTensor.Data as float[,])[1, 0], 1);
-            Assert.AreEqual((inputTensor.Data as float[,])[1, 4], 0);
+            Assert.IsNotNull(inputTensor.Data);
+            Assert.AreEqual(inputTensor.Data[0, 0], 0);
+            Assert.AreEqual(inputTensor.Data[0, 4], 0);
+            Assert.AreEqual(inputTensor.Data[1, 0], 1);
+            Assert.AreEqual(inputTensor.Data[1, 4], 0);
-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
-                ValueType = Tensor.TensorType.Integer
+                ValueType = TensorProxy.TensorType.Integer
-            var generator = new PreviousActionInputGenerator();
+            var generator = new PreviousActionInputGenerator(new TensorCachingAllocator());
-            Assert.IsNotNull(inputTensor.Data as int[,]);
-            Assert.AreEqual((inputTensor.Data as int[,])[0, 0], 1);
-            Assert.AreEqual((inputTensor.Data as int[,])[0, 1], 2);
-            Assert.AreEqual((inputTensor.Data as int[,])[1, 0], 3);
-            Assert.AreEqual((inputTensor.Data as int[,])[1, 1], 4);
+            Assert.IsNotNull(inputTensor.Data);
+            Assert.AreEqual(inputTensor.Data[0, 0], 1);
+            Assert.AreEqual(inputTensor.Data[0, 1], 2);
+            Assert.AreEqual(inputTensor.Data[1, 0], 3);
+            Assert.AreEqual(inputTensor.Data[1, 1], 4);
-            var inputTensor = new Tensor()
+            var inputTensor = new TensorProxy()
-                ValueType = Tensor.TensorType.FloatingPoint
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            var generator = new ActionMaskInputGenerator();
+            var generator = new ActionMaskInputGenerator(new TensorCachingAllocator());
-            Assert.IsNotNull(inputTensor.Data as float[,]);
-            Assert.AreEqual((inputTensor.Data as float[,])[0, 0], 1);
-            Assert.AreEqual((inputTensor.Data as float[,])[0, 4], 1);
-            Assert.AreEqual((inputTensor.Data as float[,])[1, 0], 0);
-            Assert.AreEqual((inputTensor.Data as float[,])[1, 4], 1);
+            Assert.IsNotNull(inputTensor.Data);
+            Assert.AreEqual(inputTensor.Data[0, 0], 1);
+            Assert.AreEqual(inputTensor.Data[0, 4], 1);
+            Assert.AreEqual(inputTensor.Data[1, 0], 0);
+            Assert.AreEqual(inputTensor.Data[1, 4], 1);
        }
    }
 }
--- a/UnitySDK/Assets/ML-Agents/Editor/Tests/MultinomialTest.cs
+++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/MultinomialTest.cs
 using System;
+using Barracuda;
 using NUnit.Framework;
 using UnityEngine;
 using MLAgents.InferenceBrain;
        {
            Multinomial m = new Multinomial(2018);

-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                Data = new float[1, 3] {{0.1f, 0.2f, 0.7f}},
-                ValueType = Tensor.TensorType.FloatingPoint
+                Data = new Tensor(1, 3, new[] {0.1f, 0.2f, 0.7f}),
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                Data = new float[1, 3],
-                ValueType = Tensor.TensorType.FloatingPoint
+                Data = new Tensor(1, 3),
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            int i = 0;
-            foreach (var f in dst.Data)
+            // The for header already advances i; the leftover ++i from the old
+            // foreach version made the loop skip every odd index, so it is removed.
+            for (var i = 0; i < dst.Data.length; i++)
-                Assert.AreEqual(reference[i], f);
+                Assert.AreEqual(reference[i], dst.Data[i]);
-                ++i;
            }
        }
        {
            Multinomial m = new Multinomial(2018);

-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                Data = new float[1, 3] {{Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50}},
-                ValueType = Tensor.TensorType.FloatingPoint
+                Data = new Tensor(1, 3, new[] {Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50}),
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                Data = new float[1, 3],
-                ValueType = Tensor.TensorType.FloatingPoint
+                Data = new Tensor(1, 3),
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            int i = 0;
-            foreach (var f in dst.Data)
+            // The for header already advances i; the leftover ++i from the old
+            // foreach version made the loop skip every odd index, so it is removed.
+            for (var i = 0; i < dst.Data.length; i++)
-                Assert.AreEqual(reference[i], f);
+                Assert.AreEqual(reference[i], dst.Data[i]);
-                ++i;
            }
        }
        {
            Multinomial m = new Multinomial(2018);

-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                Data = new float[2, 3]
+                Data = new Tensor(2, 3, new []
-                    {Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50},
-                    {Mathf.Log(0.3f) - 25, Mathf.Log(0.4f) - 25, Mathf.Log(0.3f) - 25},
+                    Mathf.Log(0.1f) - 50, Mathf.Log(0.2f) - 50, Mathf.Log(0.7f) - 50,
+                    Mathf.Log(0.3f) - 25, Mathf.Log(0.4f) - 25, Mathf.Log(0.3f) - 25
-                },
-                ValueType = Tensor.TensorType.FloatingPoint
+                }),
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                Data = new float[2, 3],
-                ValueType = Tensor.TensorType.FloatingPoint
+                Data = new Tensor(2, 3),
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            int i = 0;
-            foreach (var f in dst.Data)
+            // The for header already advances i; the leftover ++i from the old
+            // foreach version made the loop skip every odd index, so it is removed.
+            for (var i = 0; i < dst.Data.length; i++)
-                Assert.AreEqual(reference[i], f);
+                Assert.AreEqual(reference[i], dst.Data[i]);
-                ++i;
            }
        }
        {
            Multinomial m = new Multinomial(2018);

-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                ValueType = Tensor.TensorType.Integer
+                ValueType = TensorProxy.TensorType.Integer
            };

            Assert.Throws<NotImplementedException>(() => m.Eval(src, null));
        {
            Multinomial m = new Multinomial(2018);

-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                ValueType = Tensor.TensorType.Integer
+                ValueType = TensorProxy.TensorType.Integer
            };

            Assert.Throws<ArgumentException>(() => m.Eval(src, dst));
        {
            Multinomial m = new Multinomial(2018);
            
-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint
+                ValueType = TensorProxy.TensorType.FloatingPoint
            };

            Assert.Throws<ArgumentNullException>(() => m.Eval(src, dst));
        {
            Multinomial m = new Multinomial(2018);
            
-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = new float[1]
+                ValueType = TensorProxy.TensorType.FloatingPoint,
+                Data = new Tensor(0,1)
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint
+                ValueType = TensorProxy.TensorType.FloatingPoint
            };

            Assert.Throws<ArgumentNullException>(() => m.Eval(src, dst));
-        public void TestSrcWrongShape()
-        {
-            Multinomial m = new Multinomial(2018);
-            
-            Tensor src = new Tensor
-            {
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = new float[1]
-            };
-            Tensor dst = new Tensor
-            {
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = new float[1]
-            };
-
-            Assert.Throws<ArgumentException>(() => m.Eval(src, dst));
-        }
-        
-        [Test]
-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = new float[1, 1]
+                ValueType = TensorProxy.TensorType.FloatingPoint,
+                Data = new Tensor(0,1)
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = new float[1]
+                ValueType = TensorProxy.TensorType.FloatingPoint,
+                Data = new Tensor(0,2)
            };

            Assert.Throws<ArgumentException>(() => m.Eval(src, dst));
        {
            Multinomial m = new Multinomial(2018);
            
-            Tensor src = new Tensor
+            TensorProxy src = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = new float[1, 1]
+                ValueType = TensorProxy.TensorType.FloatingPoint,
+                Data = new Tensor(1, 1)
-            Tensor dst = new Tensor
+            TensorProxy dst = new TensorProxy
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = new float[2, 1]
+                ValueType = TensorProxy.TensorType.FloatingPoint,
+                Data = new Tensor(2, 1)
            };

            Assert.Throws<ArgumentException>(() => m.Eval(src, dst));
--- a/UnitySDK/Assets/ML-Agents/Editor/Tests/RandomNormalTest.cs
+++ b/UnitySDK/Assets/ML-Agents/Editor/Tests/RandomNormalTest.cs
 using System;
+using Barracuda;
-
-using UnityEngine;
-using System.Collections;
-


 namespace MLAgents.Tests
    {
+
-        public void RandomNormalTestTwoDouble ()
+        public void RandomNormalTestTwoDouble()
-            RandomNormal rn = new RandomNormal (2018);
+            RandomNormal rn = new RandomNormal(2018);
-            Assert.AreEqual (firstValue, rn.NextDouble (), epsilon);
-            Assert.AreEqual (secondValue, rn.NextDouble (), epsilon);
+            Assert.AreEqual(firstValue, rn.NextDouble(), epsilon);
+            Assert.AreEqual(secondValue, rn.NextDouble(), epsilon);
-        public void RandomNormalTestWithMean ()
+        public void RandomNormalTestWithMean()
-            RandomNormal rn = new RandomNormal (2018, 5.0f);
+            RandomNormal rn = new RandomNormal(2018, 5.0f);
-            Assert.AreEqual (firstValue + 5.0, rn.NextDouble (), epsilon);
-            Assert.AreEqual (secondValue + 5.0, rn.NextDouble (), epsilon);
+            Assert.AreEqual(firstValue + 5.0, rn.NextDouble(), epsilon);
+            Assert.AreEqual(secondValue + 5.0, rn.NextDouble(), epsilon);
-        public void RandomNormalTestWithStddev ()
+        public void RandomNormalTestWithStddev()
-            RandomNormal rn = new RandomNormal (2018, 0.0f, 4.2f);
+            RandomNormal rn = new RandomNormal(2018, 0.0f, 4.2f);
-            Assert.AreEqual (firstValue * 4.2, rn.NextDouble (), epsilon);
-            Assert.AreEqual (secondValue * 4.2, rn.NextDouble (), epsilon);
+            Assert.AreEqual(firstValue * 4.2, rn.NextDouble(), epsilon);
+            Assert.AreEqual(secondValue * 4.2, rn.NextDouble(), epsilon);
-        public void RandomNormalTestWithMeanStddev ()
+        public void RandomNormalTestWithMeanStddev()
-            RandomNormal rn = new RandomNormal (2018, mean, stddev);
+            RandomNormal rn = new RandomNormal(2018, mean, stddev);
-            Assert.AreEqual (firstValue * stddev + mean, rn.NextDouble (), epsilon);
-            Assert.AreEqual (secondValue * stddev + mean, rn.NextDouble (), epsilon);
+            Assert.AreEqual(firstValue * stddev + mean, rn.NextDouble(), epsilon);
+            Assert.AreEqual(secondValue * stddev + mean, rn.NextDouble(), epsilon);
-        public void RandomNormalTestTensorInt ()
+        public void RandomNormalTestTensorInt()
-            RandomNormal rn = new RandomNormal (1982);
-            Tensor t = new Tensor {
-                ValueType = Tensor.TensorType.Integer
+            RandomNormal rn = new RandomNormal(1982);
+            TensorProxy t = new TensorProxy
+            {
+                ValueType = TensorProxy.TensorType.Integer
-            Assert.Throws<NotImplementedException> (() => rn.FillTensor (t));
+            Assert.Throws<NotImplementedException>(() => rn.FillTensor(t));
-        public void RandomNormalTestDataNull ()
+        public void RandomNormalTestDataNull()
-            RandomNormal rn = new RandomNormal (1982);
-            Tensor t = new Tensor {
-                ValueType = Tensor.TensorType.FloatingPoint
+            RandomNormal rn = new RandomNormal(1982);
+            TensorProxy t = new TensorProxy
+            {
+                ValueType = TensorProxy.TensorType.FloatingPoint
-            Assert.Throws<ArgumentNullException> (() => rn.FillTensor (t));
+            Assert.Throws<ArgumentNullException>(() => rn.FillTensor(t));
-        public void RandomNormalTestDistribution ()
+        public void RandomNormalTestDistribution()
-            RandomNormal rn = new RandomNormal (2018, mean, stddev);
+            RandomNormal rn = new RandomNormal(2018, mean, stddev);

            int numSamples = 100000;
            // Adapted from https://www.johndcook.com/blog/standard_deviation/
-            for (int i = 0; i < numSamples; i++) {
-                double x = rn.NextDouble ();
-                if (i == 0) {
+            for (int i = 0; i < numSamples; i++)
+            {
+                double x = rn.NextDouble();
+                if (i == 0)
+                {
-                } else {
+                }
+                else
+                {
                    newM = oldM + (x - oldM) / i;
                    newS = oldS + (x - oldM) * (x - newM);


            double sampleMean = newM;
            double sampleVariance = newS / (numSamples - 1);
-            double sampleStddev = Math.Sqrt (sampleVariance);
+            double sampleStddev = Math.Sqrt(sampleVariance);
-            Assert.AreEqual (mean, sampleMean, 0.01);
-            Assert.AreEqual (stddev, sampleStddev, 0.01);
+            Assert.AreEqual(mean, sampleMean, 0.01);
+            Assert.AreEqual(stddev, sampleStddev, 0.01);
-        public void RandomNormalTestTensor ()
+        public void RandomNormalTestTensor()
-            RandomNormal rn = new RandomNormal (1982);
-            Tensor t = new Tensor {
-                ValueType = Tensor.TensorType.FloatingPoint,
-                Data = Array.CreateInstance (typeof (float), new long [3] { 3, 4, 2 })
+            RandomNormal rn = new RandomNormal(1982);
+            TensorProxy t = new TensorProxy
+            {
+                ValueType = TensorProxy.TensorType.FloatingPoint,
+                Data = new Tensor(1, 3, 4, 2)
-            rn.FillTensor (t);
+            rn.FillTensor(t);
-            float [] reference = new float []
+            float[] reference = new float[]
-                0.9561074f,
-                -1.130287f,
-                -0.7763879f,
-                -0.3027347f,
-                -0.1377991f,
-                -0.02921959f,
-                0.9520947f,
-                -0.5018106f,
+                0.3414804f,
+                -1.130287f,
-                -0.07491868f,
+                -0.5105762f,
+                -0.3027347f,
-                0.3331701f,
+                1.225356f,
+                -0.02921959f,
-                1.088157f,
-                0.3414804f,
+                -1.092338f,
+                0.9561074f,
+                -0.5018106f,
-                -0.5105762f,
+                -0.7763879f,
+                -0.07491868f,
-                1.225356f,
+                -0.1377991f,
+                0.3331701f,
-                -1.092338f,
+                0.9520947f,
+                1.088157f,
-            int i = 0;
-            foreach (float f in t.Data) {
-                Assert.AreEqual (f, reference [i], epsilon);
-                ++i;
+            for (var i = 0; i < t.Data.length; i++)
+            {
+                Assert.AreEqual(t.Data[i], reference[i], 0.0001);
-
-
        }
    }
 }
--- a/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs
+++ b/UnitySDK/Assets/ML-Agents/Examples/PushBlock/Scripts/PushAgentBasic.cs
    Rigidbody agentRB;  //cached on initialization
    Material groundMaterial; //cached on Awake()
    RayPerception rayPer;
+    
+    float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f };
+    string[] detectableObjects = { "block", "goal", "wall" };

    /// <summary>
    /// We will be changing the ground material based on success/failure
        if (useVectorObs)
        {
            var rayDistance = 12f;
-            float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f };
-            var detectableObjects = new[] { "block", "goal", "wall" };
+            
            AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 0f, 0f));
            AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 1.5f, 0f));
        }
--- a/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception3D.cs
+++ b/UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Scripts/RayPerception3D.cs
-using System.Collections.Generic;
+using System;
+using System.Collections.Generic;
 using UnityEngine;

 namespace MLAgents
    {
        Vector3 endPosition;
        RaycastHit hit;
+        private float[] subList;

        /// <summary>
        /// Creates perception vector to be used as part of an observation of an agent.
            float[] rayAngles, string[] detectableObjects,
            float startOffset, float endOffset)
        {
+            if (subList == null || subList.Length != detectableObjects.Length + 2)
+                subList = new float[detectableObjects.Length + 2];
+            
+            perceptionBuffer.Capacity = subList.Length * rayAngles.Length;
+            
            // For each ray sublist stores categorical information on detected object
            // along with object distance.
            foreach (float angle in rayAngles)
                        endPosition, Color.black, 0.01f, true);
                }

-                float[] subList = new float[detectableObjects.Length + 2];
+                Array.Clear(subList, 0, subList.Length);
+                
                if (Physics.SphereCast(transform.position +
                                       new Vector3(0f, startOffset, 0f), 0.5f,
                    endPosition, out hit, rayDistance))
                    subList[detectableObjects.Length] = 1f;
                }

-                perceptionBuffer.AddRange(subList);
+                Utilities.AddRangeNoAlloc(perceptionBuffer, subList);
            }

            return perceptionBuffer;
--- a/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs
+++ b/UnitySDK/Assets/ML-Agents/Examples/Soccer/Scripts/AgentSoccer.cs

    public enum Team
    {
-        Red,
+        Red, 
-        Striker,
+        Striker, 
-
+    
-
+    
+    
+    float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f };
+    string[] detectableObjectsRed = { "ball", "redGoal", "blueGoal",
+        "wall", "redAgent", "blueAgent" };
+    string[] detectableObjectsBlue = { "ball", "blueGoal", "redGoal",
+        "wall", "blueAgent", "redAgent" };

    public void ChooseRandomTeam()
    {

        var playerState = new PlayerState
        {
-            agentRB = agentRb,
-            startingPos = transform.position,
+            agentRB = agentRb, 
+            startingPos = transform.position, 
            agentScript = this,
        };
        area.playerStates.Add(playerState);
    public override void CollectObservations()
    {
        float rayDistance = 20f;
-        float[] rayAngles = { 0f, 45f, 90f, 135f, 180f, 110f, 70f };
-            detectableObjects = new[] { "ball", "redGoal", "blueGoal",
-                "wall", "redAgent", "blueAgent" };
+            detectableObjects = detectableObjectsRed;
-            detectableObjects = new[] { "ball", "blueGoal", "redGoal",
-                "wall", "blueAgent", "redAgent" };
+            detectableObjects = detectableObjectsBlue;
        }
        AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 0f, 0f));
        AddVectorObs(rayPer.Perceive(rayDistance, rayAngles, detectableObjects, 1f, 0f));
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute
 #pragma kernel Conv2D
 #pragma kernel Conv2D_RegisterBlock4x2
-#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
-#pragma kernel Conv2D_L1Cached32_RegisterBlock4x4
+//#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
+//#pragma kernel Conv2D_L1Cached32_RegisterBlock4x4
+#pragma kernel Conv2DKernelKxK_T16x16_R4x4 BLOCK_SIZE=4                                             SUFFIX=KernelKxK_T16x16_R
+#pragma kernel Conv2DKernelKxK_StrictC16K64_T16x16_R4x4 BLOCK_SIZE=4 STRICT_CHANNELS=1              SUFFIX=KernelKxK_StrictC16K64_T16x16_R
+#pragma kernel Conv2DKernel1x1_StrictC16K64_T16x16_R4x4 BLOCK_SIZE=4 KERNEL_1x1=1 STRICT_CHANNELS=1 SUFFIX=Kernel1x1_StrictC16K64_T16x16_R

 #pragma kernel DepthwiseConv2D

 uint4 _Pad;
 uint4 _Stride;

+#define DEBUG_CHECK_BOUNDS 0
+
+// Conv2DBlock64x64_4x4 + index optimizations
+//        T
+//      -1|0             -1|0
+// 16: 142|142ms        144|155ms
+
// Multiply-add helper: returns a*b + c, written as dot((a, c), (b, 1)).
+float ffma(float a, float b, float c) { return dot(float2(a,c), float2(b,1)); }
+#define FUNC_NAME(KERNEL, SUFFIX, SIZE) KERNEL##SUFFIX##SIZE##x##SIZE
+#define CACHE_NAME(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR
+
+#define KERNEL_NAME Conv2D
+
+#if BLOCK_SIZE == 4
+#define TRANSPOSED_X 0
+#define BUF_OFFSET 0
+#define CACHE_DEPTH 16
+groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, X)[CACHE_DEPTH*16*BLOCK_SIZE+(1-TRANSPOSED_X)*CACHE_DEPTH];
+groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, W)[CACHE_DEPTH*16*BLOCK_SIZE];
+[numthreads(16,16,1)]
+void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex)
+{
+    DISPATCH_ARGS(K.kernelCount, O.width * O.height * O.batch, 1);
+    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
+
+    // [W*H, Ky*Kx*In] * [Ky*Kx*In, Out] => [W*H, Out]
+
+    #define X_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, X)
+    #define W_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, W)
+
+    int x = (int)dispatchThreadID.x * BLOCK_SIZE; // output_channels
+    int y = (int)dispatchThreadID.y * BLOCK_SIZE; // batch*width*height
+    int tx = (int)groupThreadID.x;
+    int ty = (int)groupThreadID.y;
+    int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE;
+    int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE;
+    int ti = (int)threadIndex;
+    uint w      = O.width;
+    uint h      = O.height;
+    int channels = X.channels;
+    int widthX  = X.width;
+    int heightX = X.height;
+    int strideX = X.channels;
+    int strideK = K.channels;
+    int strideO = O.channels;
+    int offsetX = BUF_OFFSET;
+    int offsetK = BUF_OFFSET;
+    int offsetO = BUF_OFFSET;
+
+    float4 dstA[4];
+    dstA[0].x = B.Get(x+0); dstA[0].y = B.Get(x+1); dstA[0].z = B.Get(x+2); dstA[0].w = B.Get(x+3);
+    dstA[1].x = B.Get(x+0); dstA[1].y = B.Get(x+1); dstA[1].z = B.Get(x+2); dstA[1].w = B.Get(x+3);
+    dstA[2].x = B.Get(x+0); dstA[2].y = B.Get(x+1); dstA[2].z = B.Get(x+2); dstA[2].w = B.Get(x+3);
+    dstA[3].x = B.Get(x+0); dstA[3].y = B.Get(x+1); dstA[3].z = B.Get(x+2); dstA[3].w = B.Get(x+3);
+
+    int readK = strideK * (ti>>6) + bx + (ti&63) + offsetK;
+    #if STRICT_CHANNELS == 1
+    #else
+    bool maskK = (bx + (ti&63)) < strideK;
+    #endif
+
+#if TRANSPOSED_X == 1
+    uint centroidId = by + (ti&63);
+    #if KERNEL_1x1 == 1
+    int readX = strideX * (ti>>6) + centroidId;
+    #else
+    int batch = centroidId / w / h;
+    int topY = (centroidId / w % h) * _Stride.y - _Pad.y;
+    int leftX = (centroidId % w) * _Stride.x - _Pad.x;
+    int cornerId = batch * heightX * widthX + topY * widthX + leftX;
+    int readX = strideX * (ti>>6) + cornerId;
+    bool mask;
+    #endif
+#else
+    uint4 centroidId = uint4(
+        (by + (ti>>4) +  0),
+        (by + (ti>>4) + 16),
+        (by + (ti>>4) + 32),
+        (by + (ti>>4) + 48));
+    #if KERNEL_1x1 == 1
+    int4 readX = strideX * centroidId + (ti&15);
+    #else
+    int4 batch = centroidId / w / h;
+    int4 topY = (centroidId / w % h) * _Stride.y - _Pad.y;
+    int4 leftX = (centroidId % w) * _Stride.x - _Pad.x;
+    int4 cornerId = batch * heightX * widthX + topY * widthX + leftX;
+    int4 readX = strideX * cornerId + (ti&15);
+    bool4 mask;
+    #endif
+#endif
+
+#if KERNEL_1x1 == 1
+    {
+        {
+#else
+    for (int dy = 0; dy < (int)K.GetKernelHeight(); dy++)
+    {
+        for (int dx = 0; dx < (int)K.GetKernelWidth(); dx++)
+        {
+            int kernelOffsetX = (dy * widthX + dx) * strideX;
+            mask =
+                topY + dy >= 0 &&
+                topY + dy < heightX &&
+                leftX + dx >= 0 &&
+                leftX + dx < widthX;
+#endif // KERNEL_1x1
+            for (int i = 0; i < channels; i += CACHE_DEPTH)
+            {
+                #if STRICT_CHANNELS == 1
+                #else
+                if (i + CACHE_DEPTH > channels)
+                {
+                    int channelRemainder = channels - i;
+                    [unroll] for (int j = 0; j < 4; ++j)
+                    {
+                        bool maskChannelsK = ti < 64 * (channelRemainder - j * 4);
+                        bool maskChannelsX = 
+                            #if TRANSPOSED_X == 1
+                            maskChannelsK;
+                            #else
+                            (ti&15) < channelRemainder;
+                            #endif
+
+                        W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] = 
+                            (maskK & maskChannelsK) ? K.data[readK] : 0;
+                        readK += strideK * max(0, min(channelRemainder - j * 4, 4));
+
+                        #if TRANSPOSED_X == 1
+                        X_[ti + 256*j] =
+                            #if KERNEL_1x1 == 1
+                            maskChannelsX ? X.data[readX + strideX * (i + j * 4) + offsetX]: 0;
+                            #else
+                            (mask && maskChannelsX) ? X.data[readX + strideX * (i + j * 4) + kernelOffsetX + offsetX]: 0;
+                            #endif
+                        #else
+                        X_[(ti>>4) + 65*(ti&15) + 16*j] =
+                            #if KERNEL_1x1 == 1
+                            maskChannelsX ? X.data[readX[j] + i + offsetX]: 0;
+                            #else
+                            (mask[j] && maskChannelsX) ? X.data[readX[j] + i + kernelOffsetX + offsetX]: 0;
+                            #endif
+                        #endif
+                    }
+                }
+                else
+                #endif
+                [unroll] for (int j = 0; j < 4; ++j)
+                {
+                    W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] =
+                        #if STRICT_CHANNELS == 1
+                        K.data[readK];
+                        #else
+                        maskK ? K.data[readK]: 0;
+                        #endif
+                    readK += strideK * 4;
+
+                    #if TRANSPOSED_X == 1
+                    X_[ti + 256*j] = 
+                        #if KERNEL_1x1 == 1
+                        X.data[readX + strideX * (i + j * 4) + offsetX];
+                        #else
+                        mask ? X.data[readX + strideX * (i + j * 4) + kernelOffsetX + offsetX]: 0;
+                        #endif
+                    #else
+                    X_[(ti>>4) + 65*(ti&15) + 16*j] =
+                        #if KERNEL_1x1 == 1
+                        X.data[readX[j] + i + offsetX];
+                        #else
+                        mask[j] ? X.data[readX[j] + i + kernelOffsetX + offsetX]: 0;
+                        #endif
+                    #endif
+
+                    #if DEBUG_CHECK_BOUNDS && KERNEL_1x1 == 0
+                    if (mask[j] && readX[j] + i + kernelOffsetX < 0)
+                        X_[(ti>>4) + 65*(ti&15) + 16*j] = -1;
+                    if (mask[j] && readX[j] + i + kernelOffsetX >= X.GetLength())
+                        X_[(ti>>4) + 65*(ti&15) + 16*j] = -1;
+                    #endif
+                }
+
+                GroupMemoryBarrierWithGroupSync();
+
+                int4 idX = int4(0,1,2,3);
+                int4 idW = int4(0,16,32,48);
+                int incX = 64 + (1-TRANSPOSED_X);
+                int incW = 64;
+
+                for (int di = 0; di < CACHE_DEPTH; di++)
+                {
+                    float4 srcX = float4(
+                        X_[idX.x + ty*4],
+                        X_[idX.y + ty*4],
+                        X_[idX.z + ty*4],
+                        X_[idX.w + ty*4]);
+                    float4 srcW = float4(
+                        W_[idW.x + tx],
+                        W_[idW.y + tx],
+                        W_[idW.z + tx],
+                        W_[idW.w + tx]
+                    );
+                    idX += incX;
+                    idW += incW;
+
+                    dstA[0].x = ffma(srcX.x, srcW.x, dstA[0].x);
+                    dstA[0].y = ffma(srcX.x, srcW.y, dstA[0].y);
+                    dstA[0].z = ffma(srcX.x, srcW.z, dstA[0].z);
+                    dstA[0].w = ffma(srcX.x, srcW.w, dstA[0].w);
+
+                    dstA[1].x = ffma(srcX.y, srcW.x, dstA[1].x);
+                    dstA[1].y = ffma(srcX.y, srcW.y, dstA[1].y);
+                    dstA[1].z = ffma(srcX.y, srcW.z, dstA[1].z);
+                    dstA[1].w = ffma(srcX.y, srcW.w, dstA[1].w);
+
+                    dstA[2].x = ffma(srcX.z, srcW.x, dstA[2].x);
+                    dstA[2].y = ffma(srcX.z, srcW.y, dstA[2].y);
+                    dstA[2].z = ffma(srcX.z, srcW.z, dstA[2].z);
+                    dstA[2].w = ffma(srcX.z, srcW.w, dstA[2].w);
+
+                    dstA[3].x = ffma(srcX.w, srcW.x, dstA[3].x);
+                    dstA[3].y = ffma(srcX.w, srcW.y, dstA[3].y);
+                    dstA[3].z = ffma(srcX.w, srcW.z, dstA[3].z);
+                    dstA[3].w = ffma(srcX.w, srcW.w, dstA[3].w);
+                }
+
+                GroupMemoryBarrierWithGroupSync();
+            }
+        }
+    }
+
+    [unroll] for (int sy = 0; sy < 4 && y+sy < (int)w * (int)h * (int)O.batch; ++sy)
+        [unroll] for (int sx = 0; sx < 4 && x+sx < strideO; ++sx)
+            O.data[strideO * (y+sy) + x+sx + offsetO] = dstA[sy][sx];
+
+    #undef X_
+    #undef W_
+}
+#else
+#endif
+#undef TRANSPOSED_X
+#undef CACHE_DEPTH
+#undef BUF_OFFSET
+#undef KERNEL_NAME
+
 NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
 void Conv2D(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
 CONV2D_L1CACHED(32,4, fastfma)


-
+// IDEA: iterate over channels in the inner loop - needs channels first layout
 NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
 void DepthwiseConv2D(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute
 #pragma kernel ScaleBias_CNyx
 #pragma kernel ScaleBias_CNyx2
 #pragma kernel ScaleBias_Flat
+#pragma kernel ScaleBias_Loop
 #pragma kernel Upsample2D
 #pragma kernel AvgPool2D
 #pragma kernel MaxPool2D
 #pragma kernel GlobalAvgPool2D
 #pragma kernel InstanceNorm
+#pragma kernel InstanceNormTail_CNyx2
+#pragma kernel InstanceNormTail_Flat
 #pragma kernel Copy

 /*
 uint4 _Stride;
 uint4 _Pad;
 float _Alpha;
+uint _LoopStride;

 NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
 void ScaleBias(uint3 dispatchThreadID : SV_DispatchThreadID)
    float v = X.Get(i);
    v = v * scale + bias;
    O.Set(i, v);
+}
+
+// Applies a per-channel scale (read from W) and bias (read from B) to X and
+// stores the result in O. Each thread starts at its own dispatch index and
+// then advances by _LoopStride until it runs past the end of O, so a single
+// thread may process several elements.
+NUMTHREADS((256,1,1), (128,1,1), (64,1,1))
+void ScaleBias_Loop(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    DISPATCH_ARGS(O.length, 1, 1);
+    TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
+
+    uint count = O.GetLength();
+
+    for (uint idx = dispatchThreadID.x; idx < count; idx += _LoopStride)
+    {
+        // channel index is derived from the flat element index
+        uint channel = idx % X.channels;
+
+        float v = X.Get(idx);
+        v = v * W.Get(channel) + B.Get(channel);
+        O.Set(idx, v);
+    }
 }

 NUMTHREADS((32,4,1), (32,2,1), (16,2,1))
                O.Set(n, y, x, c, v);
            }
    }
+}
+
+// Flat (1D dispatch) tail pass of InstanceNorm: normalizes every element of X
+// with per-channel statistics and writes the result to O.
+//   W - read as the per-channel variance
+//   B - read as the per-channel mean
+// Gamma/beta are not applied in this pass (see the formula comment below).
+NUMTHREADS((256,1,1), (128,1,1), (64,1,1))
+void InstanceNormTail_Flat(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    DISPATCH_ARGS(O.length, 1, 1);
+    TENSOR_ARGS4(X, W, B, O);
+
+    uint i = dispatchThreadID.x;
+    // Valid element indices are [0, length). The comparison must be >= so that
+    // a thread landing exactly on i == length does not touch one element past
+    // the end of the buffers (matches the check in InstanceNormTail_CNyx2).
+    if (i >= O.GetLength()) return;
+
+    // channel index is derived from the flat element index
+    uint c = i % X.channels;
+
+    float variance = W.Get(c);
+    float mean = B.Get(c);
+    // normalization factor
+    float invNormFactor = 1 / sqrt(variance + FLT_EPSILON);
+
+    float v = X.Get(i);
+    //v = gamma * (v * invNormFactor - mean * invNormFactor) + beta
+    v = v * invNormFactor - mean * invNormFactor;
+
+    O.Set(i, v);
+}
+
+// Tail pass of InstanceNorm dispatched in 2D: x selects the channel and y
+// selects the remaining (batch * height * width) position. Normalizes X with
+// the per-channel variance read from W and mean read from B, writing to O.
+// Gamma/beta are not applied in this pass.
+NUMTHREADS((32,4,1), (32,2,1), (16,2,1))
+void InstanceNormTail_CNyx2(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
+    TENSOR_ARGS4(X, W, B, O);
+
+    uint channel = dispatchThreadID.x;
+    uint flatIndex = dispatchThreadID.y * X.channels + channel;
+
+    // skip threads outside the channel range or past the end of the tensor
+    if (channel >= X.channels || flatIndex >= X.GetLength())
+        return;
+
+    float mean = B.Get(channel);
+    float variance = W.Get(channel);
+    // normalization factor
+    float invStdDev = 1 / sqrt(variance + FLT_EPSILON);
+
+    // full formula: v = gamma * (v * invStdDev - mean * invStdDev) + beta
+    float v = X.Get(flatIndex);
+    v = v * invStdDev - mean * invStdDev;
+
+    O.Set(flatIndex, v);
 }

 NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md
 # Release notes

+## 0.2.4
+- Switched to 2018.4.3f1 as primary Unity version for testing.
+- Fixed ScaleBias scheduling issue with large amounts of data (reproduced with MobileNet @ 16 batch)
+- Fixed buffer overrun in ThreadGroup SharedMemory when TRANSPOSE_X and/or SHIFTED_X paths are enabled. This should fix GPU worker issues on Windows.
+- Added string cache to minimise GC pressure generated by string concatenation.
+- Added small fixes for temp memory allocations, saves ~200B per layer.
+- Refactored inner loop workings, to avoid GC allocations for delegates.
+- Fixed input handling for layers: inputs are no longer regenerated with every execution. Static model tensors now persist until the worker is disposed.
+- Bumped Burst version to 1.1.1.
+
+## 0.2.3
+- Rewrote Dense, Conv and some other ops on GPU. Speedup of 33% in most models with batch=1 and over 100% for batch=16.
+- Optimizations: reimplemented InstanceNormalization using pyramid approach for calculating mean and variance.
+
 ## 0.2.2
 - Added support for --print-supported-ops flag for model converters, now it will print approximate list of supported operations. List of supported ops depends on converter.
 - Added Keras converter as part of distribution.
 - Renaldas (ReJ) Zioma
 - Mantas Puida
 - Vladimir Oster
+- Aurimas Petrovas
 - Martin Sternevald
 - Valdemar Bučilko
 - Kuba Cupisz
--- a/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json
+++ b/UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json
 {
    "name": "com.unity.barracuda",
    "displayName": "Barracuda",
-    "version": "0.2.2-preview",
+    "version": "0.2.4-preview",
    "unity": "2017.4",
    "description": "Barracuda is lightweight and cross-platform Neural Net inference library. Barracuda supports inference both on GPU and CPU.",
    "dependencies": {}
--- a/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
                    brain.brainParameters.vectorObservationSize,
                    info.vectorObservation.Count));
            }
-
-            info.stackedVectorObservation.RemoveRange(
-                0, param.vectorObservationSize);
-            info.stackedVectorObservation.AddRange(info.vectorObservation);
+            
+            Utilities.ShiftLeft(info.stackedVectorObservation, param.vectorObservationSize);
+            Utilities.ReplaceRange(info.stackedVectorObservation, info.vectorObservation, 
+                                    info.stackedVectorObservation.Count - info.vectorObservation.Count);

            info.visualObservations.Clear();
            var visualObservationCount = agentParameters.agentCameras.Count+agentParameters.agentRenderTextures.Count;
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs
 using System.Collections.Generic;
 using System.Linq;
+using Barracuda;
 using MLAgents.InferenceBrain.Utils;
 using UnityEngine;

    /// </summary>
    public class ContinuousActionOutputApplier : TensorApplier.Applier
    {
-        public void Apply(Tensor tensor, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Apply(TensorProxy tensorProxy, Dictionary<Agent, AgentInfo> agentInfo)
-            var tensorDataAction = tensor.Data as float[,];
-            var actionSize = tensor.Shape[tensor.Shape.Length - 1];
+            var actionSize = tensorProxy.Shape[tensorProxy.Shape.Length - 1];    
            var agentIndex = 0;
            foreach (var agent in agentInfo.Keys)
            {
-                    action[j] = tensorDataAction[agentIndex, j];
+                    action[j] = tensorProxy.Data[agentIndex, j];
                }
                agent.UpdateVectorAction(action);
                agentIndex++;
    {
        private int[] _actionSize;
        private Multinomial _multinomial;
+        private ITensorAllocator _allocator;
-        public DiscreteActionOutputApplier(int[] actionSize, int seed)
+        public DiscreteActionOutputApplier(int[] actionSize, int seed, ITensorAllocator allocator)
+            _allocator = allocator;
-        public void Apply(Tensor tensor, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Apply(TensorProxy tensorProxy, Dictionary<Agent, AgentInfo> agentInfo)
-            var tensorDataProbabilities = tensor.Data as float[,];
+            //var tensorDataProbabilities = tensorProxy.Data as float[,];
            var batchSize = agentInfo.Keys.Count;
            var actions = new float[batchSize, _actionSize.Length];
            var startActionIndices = Utilities.CumSum(_actionSize);
-                var actionProbs = new float[batchSize, nBranchAction];
+                var actionProbs = new TensorProxy()
+                {
+                    ValueType = TensorProxy.TensorType.FloatingPoint,
+                    Shape = new long[]{batchSize, nBranchAction},
+                    Data = _allocator.Alloc(new TensorShape(batchSize, nBranchAction))
+                };
+                
                for (var batchIndex = 0; batchIndex < batchSize; batchIndex++)
                {
                    for (var branchActionIndex = 0; 
-                        actionProbs[batchIndex, branchActionIndex] = 
-                            tensorDataProbabilities[
-                                batchIndex, startActionIndices[actionIndex] + branchActionIndex];
+                        actionProbs.Data[batchIndex, branchActionIndex] = 
+                            tensorProxy.Data[batchIndex, startActionIndices[actionIndex] + branchActionIndex];
-                var inputTensor = new Tensor()
-                {
-                    ValueType = Tensor.TensorType.FloatingPoint,
-                    Shape = new long[]{batchSize, _actionSize[actionIndex]},
-                    Data = actionProbs
-                };
-                var outputTensor = new Tensor()
+                
+                var outputTensor = new TensorProxy()
-                    ValueType = Tensor.TensorType.FloatingPoint,
+                    ValueType = TensorProxy.TensorType.FloatingPoint,
-                    Data = new float[batchSize, 1]
+                    Data = _allocator.Alloc(new TensorShape(batchSize, 1))
-                _multinomial.Eval(inputTensor, outputTensor);
-                var outTensor = outputTensor.Data as float[,];
+                
+                _multinomial.Eval(actionProbs, outputTensor);
+                
-                    actions[ii, actionIndex] = outTensor[ii, 0];
+                    actions[ii, actionIndex] = outputTensor.Data[ii, 0];
                }
            }
            var agentIndex = 0;
            this.memoryIndex = memoryIndex;
        }
        
-        public void Apply(Tensor tensor, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Apply(TensorProxy tensorProxy, Dictionary<Agent, AgentInfo> agentInfo)
-            var tensorDataMemory = tensor.Data as float[,];
-            var memorySize = (int)tensor.Shape[tensor.Shape.Length - 1];
+            var memorySize = (int)tensorProxy.Shape[tensorProxy.Shape.Length - 1];
            
            foreach (var agent in agentInfo.Keys)
            {

                for (var j = 0; j < memorySize; j++)
                {
-                    memory[memorySize * memoryIndex + j] = tensorDataMemory[agentIndex, j];
+                    memory[memorySize * memoryIndex + j] = tensorProxy.Data[agentIndex, j];
                }
                
                agent.UpdateMemoriesAction(memory);
    /// </summary>
    public class MemoryOutputApplier : TensorApplier.Applier
    {
-        public void Apply(Tensor tensor, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Apply(TensorProxy tensorProxy, Dictionary<Agent, AgentInfo> agentInfo)
-            var tensorDataMemory = tensor.Data as float[,];
-            var memorySize = tensor.Shape[tensor.Shape.Length - 1];
+            var memorySize = tensorProxy.Shape[tensorProxy.Shape.Length - 1];
-                    memory.Add(tensorDataMemory[agentIndex, j]);
+                    memory.Add(tensorProxy.Data[agentIndex, j]);
                }

                agent.UpdateMemoriesAction(memory);
    /// </summary>
    public class ValueEstimateApplier : TensorApplier.Applier
    {
-        public void Apply(Tensor tensor, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Apply(TensorProxy tensorProxy, Dictionary<Agent, AgentInfo> agentInfo)
-            var tensorDataValue = tensor.Data as float[,];
-                agent.UpdateValueAction(tensorDataValue[agentIndex, 0]);
+                agent.UpdateValueAction(tensorProxy.Data[agentIndex, 0]);
                agentIndex++;
            }
        }
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs
-#define ENABLE_BARRACUDA
-#if ENABLE_BARRACUDA
-using System;
+using System;
-using UnityEngine;
-using Tensor = MLAgents.InferenceBrain.Tensor;
+using MLAgents.InferenceBrain;

 namespace MLAgents.InferenceBrain
 {
        /// <summary>
        /// Generates the Tensor inputs that are expected to be present in the Model. 
        /// </summary>
-        /// <returns>Tensor IEnumerable with the expected Tensor inputs</returns>
-        public IReadOnlyList<Tensor> GetInputTensors()
+        /// <returns>TensorProxy IEnumerable with the expected Tensor inputs</returns>
+        public IReadOnlyList<TensorProxy> GetInputTensors()
-            List<Tensor> tensors = new List<Tensor>();
+            List<TensorProxy> tensors = new List<TensorProxy>();

            if (_model == null)
                return tensors;
-                tensors.Add(new Tensor
+                tensors.Add(new TensorProxy
-                    ValueType = Tensor.TensorType.FloatingPoint,
+                    ValueType = TensorProxy.TensorType.FloatingPoint,
                    Data = null,
                    Shape = input.shape.Select(i => (long)i).ToArray()
                });
            {
-                //Debug.Log($"{mem.input}: {mem.shape} -> {BarracudaUtils.FromBarracuda(mem.shape).Length}");
-                tensors.Add(new Tensor
+                //Debug.Log($"{mem.input}: {mem.shape} -> {BarracudaUtils.TensorShapeFromBarracuda(mem.shape).Length}");
+                tensors.Add(new TensorProxy
-                    ValueType = Tensor.TensorType.FloatingPoint,
+                    ValueType = TensorProxy.TensorType.FloatingPoint,
-                    Shape = BarracudaUtils.FromBarracuda(mem.shape)
+                    Shape = TensorUtils.TensorShapeFromBarracuda(mem.shape)
                });
            }
            
        /// <summary>
        /// Generates the Tensor outputs that are expected to be present in the Model. 
        /// </summary>
-        /// <returns>Tensor IEnumerable with the expected Tensor outputs</returns>
+        /// <returns>Array with the names of the expected Tensor outputs</returns>
        public string[] GetOutputNames()
        {
            var names = new List<string>();
        private void CheckInputTensorShape()
        {
            var tensorTester =
-                new Dictionary<string, Func<Tensor, string>>()
+                new Dictionary<string, Func<TensorProxy, string>>()
                {
                    {TensorNames.VectorObservationPlacholder, CheckVectorObsShape},
                    {TensorNames.PreviousActionPlaceholder, CheckPreviousActionShape},
        /// Checks that the shape of the Vector Observation input placeholder is the same in the
        /// model and in the Brain Parameters.
        /// </summary>
-        /// <param name="tensor"> The tensor that is expected by the model</param>
+        /// <param name="tensorProxy"> The tensor that is expected by the model</param>
-        private string CheckVectorObsShape(Tensor tensor)
+        private string CheckVectorObsShape(TensorProxy tensorProxy)
-            var totalVecObsSizeT = tensor.Shape[tensor.Shape.Length - 1];
+            var totalVecObsSizeT = tensorProxy.Shape[tensorProxy.Shape.Length - 1];
            if (vecObsSizeBp * numStackedVector != totalVecObsSizeT)
            {
                return string.Format(
        /// Checks that the shape of the Previous Vector Action input placeholder is the same in the
        /// model and in the Brain Parameters.
        /// </summary>
-        /// <param name="tensor"> The tensor that is expected by the model</param>
+        /// <param name="tensorProxy"> The tensor that is expected by the model</param>
-        private string CheckPreviousActionShape(Tensor tensor)
+        private string CheckPreviousActionShape(TensorProxy tensorProxy)
-            var numberActionsT = tensor.Shape[tensor.Shape.Length - 1];
+            var numberActionsT = tensorProxy.Shape[tensorProxy.Shape.Length - 1];
            if  (numberActionsBp != numberActionsT)
            {
                return string.Format(
        /// Checks that the shape of the visual observation input placeholder is the same in the
        /// model and in the Brain Parameters.
        /// </summary>
-        /// <param name="tensor"> The tensor that is expected by the model</param>
+        /// <param name="tensorProxy"> The tensor that is expected by the model</param>
-        private string CheckVisualObsShape(Tensor tensor, int visObsIndex)
+        private string CheckVisualObsShape(TensorProxy tensorProxy, int visObsIndex)
-            var heightT = tensor.Shape[1];
-            var widthT = tensor.Shape[2];
-            var pixelT = tensor.Shape[3];
+            var heightT = tensorProxy.Shape[1];
+            var widthT = tensorProxy.Shape[2];
+            var pixelT = tensorProxy.Shape[3];
-                    "Received Tensor of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].",
+                    "Received TensorProxy of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].",
                    visObsIndex, widthBp, heightBp, pixelBp, widthT, heightT, pixelT);
            }
            return null;
            return null;
        }
    }
-}
-
-public class BarracudaUtils
-{
-    private static Array LinearizeArray(Array src)  
-    {
-        var elementType = src.GetType().GetElementType();
-        var elementSize = Marshal.SizeOf(elementType);
-        var dest = Array.CreateInstance(elementType, src.Length);
-        Buffer.BlockCopy(src, 0, dest, 0, src.Length * elementSize);
-        return dest;
-    }
-    
-    protected static Barracuda.TensorShape ToBarracuda(long[] src)
-    {
-        if (src.Length > 4)
-            throw new NotImplementedException("Barracuda does not support Tensor shapes with rank higher than 4");
-
-        var shape = new int[4];
-
-        if (src.Length == 2)
-        {
-            shape[0] = (int)src[0];
-            shape[1] = 1;
-            shape[2] = 1;
-            shape[3] = (int)src[1];
-        }
-        else
-        {
-            for (var axis = 0; axis < src.Length; ++axis)
-                shape[shape.Length-axis-1] = (int)src[src.Length-axis-1];
-        }
-        
-        return new Barracuda.TensorShape(shape);
-    }
-    
-    private static float[] IntArrayToFloatArray(int[] src)
-    {
-        var dest = new float[src.Length];
-        for (var i = 0; i < src.Length; i++)
-            dest[i] = (float) src[i];
-
-        return dest;
-    }
-    
-    public static Barracuda.Tensor ToBarracuda(MLAgents.InferenceBrain.Tensor src)
-    {
-        Array linearArray = LinearizeArray(src.Data);
-
-        if (linearArray.GetType().GetElementType() == typeof(int))
-            linearArray = IntArrayToFloatArray(linearArray as int[]);
-
-        var shape = ToBarracuda(src.Shape);
-        return new Barracuda.Tensor(shape,  linearArray as float[], src.Name);
-    }
-    
-    internal static long[] FromBarracuda(Barracuda.TensorShape src)
-    {
-        if (src.height == 1 && src.width == 1)
-            return new long[2] {src.batch, src.channels};
-
-        return new long[4] {src.batch, src.height, src.width, src.channels};
-    }
-    
-    private static Array ReshapeArray(Array src, long[] shape)
-    {
-        var elementType = src.GetType().GetElementType();
-        var elementSize = Marshal.SizeOf(elementType);
-        var dest = Array.CreateInstance(elementType, shape);
-        Buffer.BlockCopy(src, 0, dest, 0, dest.Length * elementSize);
-        return dest;
-    }
-    
-    public static Tensor FromBarracuda(Barracuda.Tensor src, string nameOverride = null)
-    {
-        var shape = FromBarracuda(src.shape);
-        return new Tensor
-        {
-            Name = nameOverride ?? src.name,
-            ValueType = Tensor.TensorType.FloatingPoint,
-            Shape = shape,
-            Data = ReshapeArray(src.data.Download(src.length), shape)
-        };
-    }
-}
-#endif
+}
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs
    /// </summary>
    public class BiDimensionalOutputGenerator : TensorGenerator.Generator
    {
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        private ITensorAllocator _allocator;
+
+        public BiDimensionalOutputGenerator(ITensorAllocator allocator)
-            var shapeSecondAxis = tensor.Shape[tensor.Shape.Length - 1];
-            tensor.Shape[0] = batchSize;
-            if (tensor.ValueType == Tensor.TensorType.FloatingPoint)
-            {
-                tensor.Data = new float[batchSize, shapeSecondAxis];
-            }
-            else
-            {
-                tensor.Data = new int[batchSize, shapeSecondAxis];
-            }
+            _allocator = allocator;
+        }
+        
+        /// <summary>
+        /// Prepares the tensor for the current batch by delegating to
+        /// TensorUtils.ResizeTensor with this generator's allocator.
+        /// No values are written into the tensor here.
+        /// </summary>
+        /// <param name="tensorProxy">Tensor to resize.</param>
+        /// <param name="batchSize">Batch size forwarded to ResizeTensor.</param>
+        /// <param name="agentInfo">Not used by this generator.</param>
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        {
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
         }
    }

    /// </summary>
    public class BatchSizeGenerator : TensorGenerator.Generator
    {
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        private ITensorAllocator _allocator;
+
+        public BatchSizeGenerator(ITensorAllocator allocator)
-            tensor.Data = new int[] {batchSize};
+            _allocator = allocator;
+        }
+        
+        /// <summary>
+        /// Replaces the tensor's data with a freshly allocated (1,1) tensor
+        /// whose single element holds the batch size.
+        /// </summary>
+        /// <param name="tensorProxy">Tensor whose Data is replaced.</param>
+        /// <param name="batchSize">Value stored in the first element.</param>
+        /// <param name="agentInfo">Not used by this generator.</param>
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        {
+            // NOTE(review): a new tensor is requested from the allocator on every call;
+            // confirm that the allocator owner releases/reuses the previous one.
+            tensorProxy.Data = _allocator.Alloc(new TensorShape(1,1));
+            tensorProxy.Data[0] = batchSize;
         }
    }

    /// </summary>
    public class SequenceLengthGenerator : TensorGenerator.Generator
    {
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        private ITensorAllocator _allocator;
+
+        public SequenceLengthGenerator(ITensorAllocator allocator)
-            tensor.Shape = new long[0];
-            tensor.Data = new int[] {1};
+            _allocator = allocator;
+        }
+        
+        /// <summary>
+        /// Sets the tensor's shape to an empty array and replaces its data with a
+        /// freshly allocated (1,1) tensor whose single element is the constant 1.
+        /// </summary>
+        /// <param name="tensorProxy">Tensor whose Shape and Data are replaced.</param>
+        /// <param name="batchSize">Not used by this generator.</param>
+        /// <param name="agentInfo">Not used by this generator.</param>
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        {
+            // Shape is left empty while Data is (1,1); presumably this marks a
+            // scalar input - TODO confirm against the consumer of Shape.
+            tensorProxy.Shape = new long[0];
+            tensorProxy.Data = _allocator.Alloc(new TensorShape(1,1));
+
+            tensorProxy.Data[0] = 1;
         }
    }

    /// </summary>
    public class VectorObservationGenerator : TensorGenerator.Generator
    {
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        private ITensorAllocator _allocator;
+        public VectorObservationGenerator(ITensorAllocator allocator)
-            tensor.Shape[0] = batchSize;
-            var vecObsSizeT = tensor.Shape[tensor.Shape.Length - 1];
-            var floatArray = new float[batchSize, vecObsSizeT];
-            tensor.Data = floatArray;
+            _allocator = allocator;
+        }
+        
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        {
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
+            var vecObsSizeT = tensorProxy.Shape[tensorProxy.Shape.Length - 1];
+            
            var agentIndex = 0;
            foreach (var agent in agentInfo.Keys)
            {
-                    floatArray[agentIndex, j] = vectorObs[j];
+                    tensorProxy.Data[agentIndex, j] = vectorObs[j];
                }
                agentIndex++;
            }
    /// </summary>
    public class RecurrentInputGenerator : TensorGenerator.Generator
    {
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        private ITensorAllocator _allocator;
+        
+        public RecurrentInputGenerator(ITensorAllocator allocator)
+        {
+            _allocator = allocator;
+        }
+        
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
-            tensor.Shape[0] = batchSize;
-            var memorySize = tensor.Shape[tensor.Shape.Length - 1];
-            var floatArray = new float[batchSize, memorySize];
-            tensor.Data = floatArray;
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
+            
+            var memorySize = tensorProxy.Shape[tensorProxy.Shape.Length - 1];
            var agentIndex = 0;
            foreach (var agent in agentInfo.Keys)
            {
                    {
                        break;
                    }
-                    floatArray[agentIndex, j] = memory[j];
+                    tensorProxy.Data[agentIndex, j] = memory[j];
                }
                agentIndex++;
            }
    {
        private int memoriesCount;
        private int memoryIndex;
+        private ITensorAllocator _allocator;   
-        public BarracudaRecurrentInputGenerator(int memoryIndex)
+        public BarracudaRecurrentInputGenerator(int memoryIndex, ITensorAllocator allocator)
+            _allocator = allocator;
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
-            tensor.Shape[0] = batchSize;
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
-            var memorySize = (int)tensor.Shape[tensor.Shape.Length - 1];
-            
-            tensor.Data = new float[batchSize, memorySize];
+            var memorySize = (int)tensorProxy.Shape[tensorProxy.Shape.Length - 1];
            var agentIndex = 0;
            foreach (var agent in agentInfo.Keys)
            {
                    {
                        break;
                    }
-                    tensor.Data.SetValue(memory[j + offset], new int[2] {agentIndex, j});
+                    tensorProxy.Data[agentIndex, j] = memory[j + offset];
                }
                agentIndex++;
            }
    /// </summary>
    public class PreviousActionInputGenerator : TensorGenerator.Generator
    {
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        private ITensorAllocator _allocator;
+
+        public PreviousActionInputGenerator(ITensorAllocator allocator)
-            tensor.Shape[0] = batchSize;
-            var actionSize = tensor.Shape[tensor.Shape.Length - 1];
-            var intArray = new int[batchSize, actionSize];
-            tensor.Data = intArray;
+            _allocator = allocator;
+        }
+        
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        {
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
+            
+            var actionSize = tensorProxy.Shape[tensorProxy.Shape.Length - 1];
            var agentIndex = 0;
            foreach (var agent in agentInfo.Keys)
            {
-                    intArray[agentIndex, j] = (int) pastAction[j];
+                    tensorProxy.Data[agentIndex, j] = pastAction[j];
                }

                agentIndex++;
    /// </summary>
    public class ActionMaskInputGenerator : TensorGenerator.Generator
    {
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        private ITensorAllocator _allocator;
+
+        public ActionMaskInputGenerator(ITensorAllocator allocator)
-            tensor.Shape[0] = batchSize;
-            var maskSize = tensor.Shape[tensor.Shape.Length - 1];
-            var floatArray = new float[batchSize, maskSize];
-            tensor.Data = floatArray;
+            _allocator = allocator;
+        }
+        
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        {
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
+            
+            var maskSize = tensorProxy.Shape[tensorProxy.Shape.Length - 1];
            var agentIndex = 0;
            foreach (var agent in agentInfo.Keys)
            {
                    var isUnmasked = (maskList != null && maskList[j]) ? 0.0f : 1.0f;
-                    floatArray[agentIndex, j] = isUnmasked;
+                    tensorProxy.Data[agentIndex, j] = isUnmasked;
                }
                agentIndex++;
            }
    public class RandomNormalInputGenerator : TensorGenerator.Generator
    {
        private RandomNormal _randomNormal;
+        private ITensorAllocator _allocator;
-        public RandomNormalInputGenerator(int seed)
+        public RandomNormalInputGenerator(int seed, ITensorAllocator allocator)
+            _allocator = allocator;
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
-            tensor.Shape[0] = batchSize;
-            var actionSize = tensor.Shape[tensor.Shape.Length - 1];
-            tensor.Data = new float[batchSize, actionSize];
-            _randomNormal.FillTensor(tensor);
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
+            _randomNormal.FillTensor(tensorProxy);
        }
    }

    {
        private int _index;
        private bool _grayScale;
-        public VisualObservationInputGenerator(int index, bool grayScale)
+        private ITensorAllocator _allocator;
+        
+        public VisualObservationInputGenerator(int index, bool grayScale, ITensorAllocator allocator)
+            _allocator = allocator;
-        public void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
+        public void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo)
-            tensor.Data = Utilities.TextureToFloatArray(textures, _grayScale);
-            tensor.Shape[0] = textures.Count;
+            
+            TensorUtils.ResizeTensor(tensorProxy, batchSize, _allocator);
+            Utilities.TextureToTensorProxy(tensorProxy, textures, _grayScale, _allocator);
        } 
    } 
 }
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelParamLoader.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ModelParamLoader.cs
 #if ENABLE_TENSORFLOW
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
+using Barracuda;

 namespace MLAgents.InferenceBrain
 {
        /// <summary>
        /// Generates the Tensor inputs that are expected to be present in the Model. 
        /// </summary>
-        /// <returns>Tensor IEnumerable with the expected Tensor inputs</returns>
-        public IReadOnlyList<Tensor> GetInputTensors()
+        /// <returns>TensorProxy IEnumerable with the expected Tensor inputs</returns>
+        public IReadOnlyList<TensorProxy> GetInputTensors()
        {
            return _engine?.InputFeatures();
        }
        /// </summary>
-        /// <returns>Tensor IEnumerable with the expected Tensor outputs</returns>
-        public IReadOnlyList<Tensor> GetOutputTensors()
+        /// <returns>TensorProxy IEnumerable with the expected Tensor outputs</returns>
+        public IReadOnlyList<TensorProxy> GetOutputTensors()
-            var tensorList = new List<Tensor>();
+            var tensorList = new List<TensorProxy>();
-                tensorList.Add(new Tensor()
+                tensorList.Add(new TensorProxy()
                {
                    Name = TensorNames.ActionOutput,
                    Shape = new long[]
-                    ValueType = Tensor.TensorType.FloatingPoint,
+                    ValueType = TensorProxy.TensorType.FloatingPoint,
                    Data = null
                });
            }
-                    new Tensor()
+                    new TensorProxy()
                    {
                        Name = TensorNames.ActionOutput,
                        Shape = new long[]
-                        ValueType = Tensor.TensorType.FloatingPoint,
+                        ValueType = TensorProxy.TensorType.FloatingPoint,
                        Data = null
                    });
            }
-                tensorList.Add(new Tensor()
+                tensorList.Add(new TensorProxy()
                {
                    Name = TensorNames.RecurrentOutput,
                    Shape = new long[2]
-                    ValueType = Tensor.TensorType.FloatingPoint,
+                    ValueType = TensorProxy.TensorType.FloatingPoint,
                    Data = null
                });
            }
        /// <returns>The value of the scalar variable in the model. (-1 if not found)</returns>
        private int GetIntScalar(string name)
        {
-            var outputs = new Tensor[]
+            var outputs = new TensorProxy[]
-                new Tensor()
+                new TensorProxy()
-                    ValueType = Tensor.TensorType.Integer,
+                    ValueType = TensorProxy.TensorType.Integer,
-                    Data = new long[1]
+                    Data = new Tensor(1,1)
-                _engine.ExecuteGraph(new Tensor[0], outputs);
+                _engine.ExecuteGraph(new TensorProxy[0], outputs);
-            catch
+            catch (Exception ex)
+                UnityEngine.Debug.LogError($"Failed to execute GetIntScalar()\n{ex}");
-            return (outputs[0].Data as int[])[0];
+            return (int)outputs[0].Data[0];
        }

        /// <summary>
        private void CheckInputTensorShape()
        {
            var tensorTester =
-                new Dictionary<string, Func<Tensor, string>>()
+                new Dictionary<string, Func<TensorProxy, string>>()
                {
                    {TensorNames.VectorObservationPlacholder, CheckVectorObsShape},
                    {TensorNames.PreviousActionPlaceholder, CheckPreviousActionShape},
        /// <param name="tensor"> The tensor that is expected by the model</param>
        /// <returns>If the Check failed, returns a string containing information about why the
        /// check failed. If the check passed, returns null.</returns>
-        private string CheckVectorObsShape(Tensor tensor)
+        private string CheckVectorObsShape(TensorProxy tensor)
        {
            var vecObsSizeBp = _brainParameters.vectorObservationSize;
            var numStackedVector = _brainParameters.numStackedVectorObservations;
        /// <param name="tensor"> The tensor that is expected by the model</param>
        /// <returns>If the Check failed, returns a string containing information about why the
        /// check failed. If the check passed, returns null.</returns>
-        private string CheckPreviousActionShape(Tensor tensor)
+        private string CheckPreviousActionShape(TensorProxy tensor)
        {
            var numberActionsBp = _brainParameters.vectorActionSize.Length;
            var numberActionsT = tensor.Shape[1];
        /// <param name="visObsIndex"> The index of the visual observation.</param>
        /// <returns>If the Check failed, returns a string containing information about why the
        /// check failed. If the check passed, returns null.</returns>
-        private string CheckVisualObsShape(Tensor tensor, int visObsIndex)
+        private string CheckVisualObsShape(TensorProxy tensor, int visObsIndex)
        {
            var resolutionBp = _brainParameters.cameraResolutions[visObsIndex];
            var widthBp = resolutionBp.width;
            {
                return string.Format(
                    "The visual Observation {0} of the model does not match. " +
-                    "Received Tensor of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].",
+                    "Received TensorProxy of shape [?x{1}x{2}x{3}] but was expecting [?x{4}x{5}x{6}].",
                    visObsIndex, widthBp, heightBp, pixelBp, widthT, heightT, pixelT);
            }
            return null;
                    "suggest Continuous Control.");
                return;
            }
-            var tensorTester = new Dictionary<string, Func<Tensor, int, string>>();
+            var tensorTester = new Dictionary<string, Func<TensorProxy, int, string>>();
            if (_brainParameters.vectorActionSpaceType == SpaceType.continuous)
            {
                tensorTester[TensorNames.ActionOutput] = CheckContinuousActionOutputShape;
        /// by the model.</param>
        /// <returns>If the Check failed, returns a string containing information about why the
        /// check failed. If the check passed, returns null.</returns>
-        private string CheckDiscreteActionOutputShape(Tensor tensor, int modelActionSize)
+        private string CheckDiscreteActionOutputShape(TensorProxy tensor, int modelActionSize)
        {
            var bpActionSize = _brainParameters.vectorActionSize.Sum();
            if  (modelActionSize != bpActionSize)
        /// by the model.</param>
        /// <returns>If the Check failed, returns a string containing information about why the
        /// check failed. If the check passed, returns null.</returns>
-        private string CheckContinuousActionOutputShape(Tensor tensor, int modelActionSize)
+        private string CheckContinuousActionOutputShape(TensorProxy tensor, int modelActionSize)
        {
            var bpActionSize = _brainParameters.vectorActionSize[0];
            if  (modelActionSize != bpActionSize)
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TFSharpInferenceEngine.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TFSharpInferenceEngine.cs
 using System;
 using UnityEngine.Profiling;
 using System.Runtime.InteropServices;
+using Barracuda;
 using UnityEngine;

 namespace MLAgents.InferenceBrain
            Profiler.EndSample();
        }

-        public int ExecuteGraph(IEnumerable<Tensor> inputs_it, IEnumerable<Tensor> outputs_it)
+        public int ExecuteGraph(IEnumerable<TensorProxy> inputs_it, IEnumerable<TensorProxy> outputs_it)
-            Tensor[] inputs = inputs_it.ToArray();
-            Tensor[] outputs = outputs_it.ToArray();
+            TensorProxy[] inputs = inputs_it.ToArray();
+            TensorProxy[] outputs = outputs_it.ToArray();
-            inputs.ToList().ForEach((Tensor input) =>
-            {
+            inputs.ToList().ForEach((TensorProxy input) =>
+            {   
-                    var data = input.Data.GetValue(0);
+                    var data = input.Data[0];
                    if (input.DataType == typeof(int))
                    {
                        runner.AddInput(m_graph[input.Name][0], (int)data);
                }
                else
                {
-                    runner.AddInput(m_graph[input.Name][0], input.Data);
+                    runner.AddInput(m_graph[input.Name][0], input.DataType == typeof(int) ?
+                                                            TensorUtils.BarracudaToIntArray(input.Data) :
+                                                            TensorUtils.BarracudaToFloatArray(input.Data));
                }
            });

                if (outputs[i].Shape.Length == 0)
                {
                    // Handle scalars
-                    outputs[i].Data = Array.CreateInstance(outputs[i].DataType, new long[1] {1}); 
-                    outputs[i].Data.SetValue(out_tensors[i].GetValue(), 0);
+                    outputs[i].Data = new Tensor(1,1);
+                    outputs[i].Data[0] = (float)(int)out_tensors[i].GetValue();
-                    outputs[i].Data = out_tensors[i].GetValue() as Array;
+                    outputs[i].Data = TensorUtils.ArrayToBarracuda(out_tensors[i].GetValue() as Array);
                }
            }

        private static extern unsafe void TF_OperationGetAttrShape(IntPtr oper, string attr_name, long[] value, 
            int num_dims, IntPtr status);

-        private Tensor GetOpMetadata(TFOperation op)
+        private TensorProxy GetOpMetadata(TFOperation op)
        {
            TFStatus status = new TFStatus();
                        
            if (!status.Ok || shape_attr.TotalSize <= 0)
            {
-                Debug.LogWarning("Operation " + op.Name + " does not contain shape attribute or it" +
-                                 " doesn't contain valid shape data!");
+                Debug.LogWarning($"Operation {op.Name} does not contain shape attribute or it" +
+                                 $" doesn't contain valid shape data! Status: {status.StatusMessage}");
            }
            else
            {
                }
            }

-            Tensor.TensorType placeholder_type = Tensor.TensorType.FloatingPoint;
+            TensorProxy.TensorType placeholder_type = TensorProxy.TensorType.FloatingPoint;
-                    placeholder_type = Tensor.TensorType.FloatingPoint;
+                    placeholder_type = TensorProxy.TensorType.FloatingPoint;
-                    placeholder_type = Tensor.TensorType.Integer;
+                    placeholder_type = TensorProxy.TensorType.Integer;
                    break;
                default:
                    Debug.LogWarning("Operation " + op.Name + 
                        
-            Tensor t = new Tensor
+            TensorProxy t = new TensorProxy
            {
                Data = null,
                Name = op.Name,
            return t;
        }

-        public IReadOnlyList<Tensor> InputFeatures()
+        public IReadOnlyList<TensorProxy> InputFeatures()
-            List<Tensor> inputs = new List<Tensor>();
+            List<TensorProxy> inputs = new List<TensorProxy>();
            foreach (var op in m_graph.GetEnumerator())
            {
                if (op.OpType == "Placeholder")
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs
-#define ENABLE_BARRACUDA
-using System.Collections.Generic;
+using System.Collections.Generic;
+using Barracuda;
-    /// Mapping between the output Tensor names and the method that will use the
+    /// Mapping between the output tensor names and the method that will use the
-    /// This action takes as input the Tensor and the Dictionary of Agent to AgentInfo for
+    /// This action takes as input the tensor and the Dictionary of Agent to AgentInfo for
-        /// A tensor Applier's Execute method takes a Tensor and a Dictionary of Agent to AgentInfo.
-        /// Uses the data contained inside the Tensor to modify the state of the Agent. The Tensors
+        /// A tensor Applier's Execute method takes a tensor and a Dictionary of Agent to AgentInfo.
+        /// Uses the data contained inside the tensor to modify the state of the Agent. The Tensors
-        /// the same way in the dictionary and in the Tensor.
+        /// the same way in the dictionary and in the tensor.
        /// </summary>
        public interface Applier
        {
-            /// <param name="tensor"> The Tensor containing the data to be applied to the Agents</param>
+            /// <param name="tensorProxy"> The Tensor containing the data to be applied to the Agents</param>
-            void Apply(Tensor tensor, Dictionary<Agent, AgentInfo> agentInfo);
+            void Apply(TensorProxy tensorProxy, Dictionary<Agent, AgentInfo> agentInfo);
        }
        
        Dictionary<string, Applier>  _dict = new Dictionary<string, Applier>();
        /// <param name="bp"> The BrainParameters used to determine what Appliers will be
        /// used</param>
        /// <param name="seed"> The seed the Appliers will be initialized with.</param>
-        public TensorApplier(BrainParameters bp, int seed, object barracudaModel = null)
+        /// <param name="allocator"> Tensor allocator</param>
+        public TensorApplier(BrainParameters bp, int seed, ITensorAllocator allocator, object barracudaModel = null)
        {
            _dict[TensorNames.ValueEstimateOutput] = new ValueEstimateApplier();
            if (bp.vectorActionSpaceType == SpaceType.continuous)
            else
            {
-                _dict[TensorNames.ActionOutput] = new DiscreteActionOutputApplier(
-                    bp.vectorActionSize, seed);
+                _dict[TensorNames.ActionOutput] = new DiscreteActionOutputApplier(bp.vectorActionSize, seed, allocator);
-            
-            #if ENABLE_BARRACUDA
-            Barracuda.Model model = (Barracuda.Model) barracudaModel;
-            for (var i = 0; i < model?.memories.Length; i++)
+            if (barracudaModel != null)
-                _dict[model.memories[i].output] = new BarracudaMemoryOutputApplier(model.memories.Length, i);
+                Model model = (Model) barracudaModel;
+
+                for (var i = 0; i < model?.memories.Length; i++)
+                {
+                    _dict[model.memories[i].output] = new BarracudaMemoryOutputApplier(model.memories.Length, i);
+                }
-            #endif
        }

        /// <summary>
        /// <exception cref="UnityAgentsException"> One of the tensor does not have an
        /// associated applier.</exception>
        public void ApplyTensors(
-            IEnumerable<Tensor> tensors,  Dictionary<Agent, AgentInfo> agentInfos)
+            IEnumerable<TensorProxy> tensors,  Dictionary<Agent, AgentInfo> agentInfos)
        {
            foreach (var tensor in tensors)
            {
-                        "Unknow tensor expected as output : "+tensor.Name);
+                        "Unknow tensorProxy expected as output : "+tensor.Name);
                }
                _dict[tensor.Name].Apply(tensor, agentInfos);
            }
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs
-#define ENABLE_BARRACUDA
-using System.Collections.Generic;
+using System.Collections.Generic;
+using System.Runtime.InteropServices.ComTypes;
 using Barracuda;

 namespace MLAgents.InferenceBrain
    /// The Action take as argument the tensor, the current batch size and a Dictionary of
    /// Agent to AgentInfo corresponding to the current batch.
    /// Each Generator reshapes and fills the data of the tensor based of the data of the batch.
-    /// When the Tensor is an Input to the model, the shape of the Tensor will be modified
+    /// When the TensorProxy is an Input to the model, the shape of the Tensor will be modified
-    /// When the Tensor is an Output of the model, only the shape of the Tensor will be modified
+    /// When the TensorProxy is an Output of the model, only the shape of the Tensor will be modified
    /// using the current batch size. The data will be prefilled with zeros.
    /// </summary>
    public class TensorGenerator
            /// Modifies the data inside a Tensor according to the information contained in the
            /// AgentInfos contained in the current batch.
            /// </summary>
-            /// <param name="tensor"> The tensor the data and shape will be modified</param>
+            /// <param name="tensorProxy"> The tensor the data and shape will be modified</param>
-            void Generate(Tensor tensor, int batchSize, Dictionary<Agent, AgentInfo> agentInfo);
+            void Generate(TensorProxy tensorProxy, int batchSize, Dictionary<Agent, AgentInfo> agentInfo);
+        ITensorAllocator _allocator;

        /// <summary>
        /// Returns a new TensorGenerators object.
        /// <param name="seed"> The seed the Generators will be initialized with.</param>
-        public TensorGenerator(BrainParameters bp, int seed, object barracudaModel = null)
+        /// <param name="allocator"> Tensor allocator</param>
+        public TensorGenerator(BrainParameters bp, int seed, ITensorAllocator allocator, object barracudaModel = null)
+            _allocator = allocator;
+            
-            _dict[TensorNames.BatchSizePlaceholder] = new BatchSizeGenerator();
-            _dict[TensorNames.SequenceLengthPlaceholder] = new SequenceLengthGenerator();
-            _dict[TensorNames.VectorObservationPlacholder] = new VectorObservationGenerator();
-            _dict[TensorNames.RecurrentInPlaceholder] = new RecurrentInputGenerator();
-            
-            #if ENABLE_BARRACUDA
-            Barracuda.Model model = (Barracuda.Model) barracudaModel;
-            for (var i = 0; i < model?.memories.Length; i++)
+            _dict[TensorNames.BatchSizePlaceholder] = new BatchSizeGenerator(_allocator);
+            _dict[TensorNames.SequenceLengthPlaceholder] = new SequenceLengthGenerator(_allocator);
+            _dict[TensorNames.VectorObservationPlacholder] = new VectorObservationGenerator(_allocator);
+            _dict[TensorNames.RecurrentInPlaceholder] = new RecurrentInputGenerator(_allocator);
+
+            if (barracudaModel != null)
-                _dict[model.memories[i].input] = new BarracudaRecurrentInputGenerator(i);
+                Model model = (Model) barracudaModel;
+                for (var i = 0; i < model?.memories.Length; i++)
+                {
+                    _dict[model.memories[i].input] = new BarracudaRecurrentInputGenerator(i, _allocator);
+                }
-            #endif
-            
-            _dict[TensorNames.PreviousActionPlaceholder] = new PreviousActionInputGenerator();
-            _dict[TensorNames.ActionMaskPlaceholder] = new ActionMaskInputGenerator();
-            _dict[TensorNames.RandomNormalEpsilonPlaceholder] = new RandomNormalInputGenerator(seed);
+
+            _dict[TensorNames.PreviousActionPlaceholder] = new PreviousActionInputGenerator(_allocator);
+            _dict[TensorNames.ActionMaskPlaceholder] = new ActionMaskInputGenerator(_allocator);
+            _dict[TensorNames.RandomNormalEpsilonPlaceholder] = new RandomNormalInputGenerator(seed, _allocator);
            if (bp.cameraResolutions != null)
            {
                for (var visIndex = 0;
                    var index = visIndex;
                    var bw = bp.cameraResolutions[visIndex].blackAndWhite;
                    _dict[TensorNames.VisualObservationPlaceholderPrefix + visIndex] = new
-                            VisualObservationInputGenerator(index, bw);
+                            VisualObservationInputGenerator(index, bw, _allocator);
-            _dict[TensorNames.ActionOutput] = new BiDimensionalOutputGenerator();
-            _dict[TensorNames.RecurrentOutput] = new BiDimensionalOutputGenerator();
-            _dict[TensorNames.ValueEstimateOutput] = new BiDimensionalOutputGenerator();
+            _dict[TensorNames.ActionOutput] = new BiDimensionalOutputGenerator(_allocator);
+            _dict[TensorNames.RecurrentOutput] = new BiDimensionalOutputGenerator(_allocator);
+            _dict[TensorNames.ValueEstimateOutput] = new BiDimensionalOutputGenerator(_allocator);
        }

        /// <summary>
        /// data that will be used to modify the tensors</param>
        /// <exception cref="UnityAgentsException"> One of the tensor does not have an
        /// associated generator.</exception>
-        public void GenerateTensors(IEnumerable<Tensor> tensors, 
+        public void GenerateTensors(IEnumerable<TensorProxy> tensors, 
            int currentBatchSize, 
            Dictionary<Agent, AgentInfo> agentInfos)
        {
                {
                    throw new UnityAgentsException(
-                        "Unknow tensor expected as input : " + tensor.Name);
+                        "Unknow tensorProxy expected as input : " + tensor.Name);
                }
                _dict[tensor.Name].Generate(tensor, currentBatchSize, agentInfos);
            }
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorNames.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorNames.cs
 namespace MLAgents.InferenceBrain
 {
    /// <summary>
-    /// Contains the names of the input and output Tensor for the Inference Brain.
+    /// Contains the names of the input and output tensors for the Inference Brain.
    /// </summary>
    public static class TensorNames
    {
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/Multinomial.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/Multinomial.cs
        /// <exception cref="NotImplementedException">Multinomial doesn't support integer tensors</exception>
        /// <exception cref="ArgumentException">Issue with tensor shape or type</exception>
        /// <exception cref="ArgumentNullException">At least one of the tensors is not allocated</exception>
-        public void Eval(Tensor src, Tensor dst)
+        public void Eval(TensorProxy src, TensorProxy dst)
        {
            if (src.DataType != typeof(float))
            {
                throw new ArgumentNullException();
            }

-            float[,] input_data = src.Data as float[,];
-            if (input_data == null)
-            {
-                throw new ArgumentException("Input data is not of the correct shape! Required batch x logits");
-            }
-            float[,] output_data = dst.Data as float[,];
-            if (output_data == null)
-            {
-                throw new ArgumentException("Output data is not of the correct shape! Required batch x samples");
-            }
-
-            if (input_data.GetLength(0) != output_data.GetLength(0))
+            if (src.Data.batch != dst.Data.batch)
-            float[] cdf = new float[input_data.GetLength(1)];
+            float[] cdf = new float[src.Data.channels];
-            for (int batch = 0; batch < input_data.GetLength(0); ++batch)
+            for (int batch = 0; batch < src.Data.batch; ++batch)
-                for (int cls = 0; cls < input_data.GetLength(1); ++cls)
+                for (int cls = 0; cls < src.Data.channels; ++cls)
-                    maxProb = Mathf.Max(input_data[batch, cls], maxProb);
+                    maxProb = Mathf.Max(src.Data[batch, cls], maxProb);
-                for (int cls = 0; cls < input_data.GetLength(1); ++cls)
+                for (int cls = 0; cls < src.Data.channels; ++cls)
-                    sumProb += Mathf.Exp(input_data[batch, cls] - maxProb);
+                    sumProb += Mathf.Exp(src.Data[batch, cls] - maxProb);
-                for (int sample = 0; sample < output_data.GetLength(1); ++sample)
+                for (int sample = 0; sample < dst.Data.channels; ++sample)
                {
                    float p = (float)m_random.NextDouble() * sumProb;
                    int cls = 0;
                    }

-                    output_data[batch, sample] = cls;
+                    dst.Data[batch, sample] = cls;
                }

            }
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/RandomNormal.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Utils/RandomNormal.cs
 using System;
+using UnityEngine;

 namespace MLAgents.InferenceBrain.Utils
 {
            return v * s * m_stddev + m_mean;
        }

-        private void IncreaseNextDim(Array arr, long[] indices)
-        {
-            for (int i = 1; i < arr.Rank; ++i)
-            {
-                ++indices[i];
-                if (i == arr.Rank - 1 || indices[i] < arr.GetLength(i))
-                {
-                    break;
-                }
-                else
-                {
-                    indices[i] = 0;
-                }
-            }
-        }
-
        /// <summary>
        /// Fill a pre-allocated Tensor with random numbers
        /// </summary>
-        public void FillTensor(Tensor t)
+        public void FillTensor(TensorProxy t)
        {
            if (t.DataType != typeof(float))
            {
                throw new ArgumentNullException();
            }

-            long[] indices = new long[t.Data.Rank];
-
-            // Since IEnumerable is const, and we don't know the dimentions of the Array
-            // we need to traverse all the dimentions
-            // TODO: this seems like a nice general operation for the Tensor, consider moving it there
-            do
-            {
-                t.Data.SetValue((float) NextDouble(), indices);
-                ++indices[0];
-                if (indices[0] == t.Data.GetLength(0))
-                {
-                    indices[0] = 0;
-                    IncreaseNextDim(t.Data, indices);
-                }
-            } while (indices[t.Data.Rank - 1] < t.Data.GetLength(t.Data.Rank - 1));
+            for (int i = 0; i < t.Data.length; i++)
+                t.Data[i] = (float)NextDouble();
        }
    }
 }
--- a/UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs
-#define ENABLE_BARRACUDA
-
-using System;
+using System;
 using System.Collections.Generic;
 using UnityEngine;
 using System.Linq;
-using Tensor = MLAgents.InferenceBrain.Tensor;

 namespace MLAgents
 {
    [CreateAssetMenu(fileName = "NewLearningBrain", menuName = "ML-Agents/Learning Brain")]
    public class LearningBrain : Brain
    {
+        private ITensorAllocator _tensorAllocator;
        private TensorGenerator _tensorGenerator;
        private TensorApplier _tensorApplier;
 #if ENABLE_TENSORFLOW
-#elif ENABLE_BARRACUDA 
+#else 
        public NNModel model;
        private Model _barracudaModel;
        private IWorker _engine;
        private string[] _outputNames;
 #endif
+        
-        private IReadOnlyList<Tensor> _inferenceInputs;
-        private IReadOnlyList<Tensor> _inferenceOutputs;
+        private IReadOnlyList<TensorProxy> _inferenceInputs;
+        private IReadOnlyList<TensorProxy> _inferenceOutputs;

        [NonSerialized]
        private bool _isControlled;
        /// </exception>
        public void ReloadModel(int seed = 0)
        {
+            if (_tensorAllocator == null)
+                _tensorAllocator = new TensorCachingAllocator();
+            
 #if ENABLE_TENSORFLOW
            if (model != null)
            {
            _modelParamLoader = ModelParamLoader.GetLoaderAndCheck(_engine, brainParameters);
            _inferenceInputs = _modelParamLoader.GetInputTensors();
            _inferenceOutputs = _modelParamLoader.GetOutputTensors();
-            _tensorGenerator = new TensorGenerator(brainParameters, seed);
-            _tensorApplier = new TensorApplier(brainParameters, seed);
-#elif ENABLE_BARRACUDA
+            _tensorGenerator = new TensorGenerator(brainParameters, seed, _tensorAllocator);
+            _tensorApplier = new TensorApplier(brainParameters, seed, _tensorAllocator);
+#else
            if (model != null)
            {
                #if BARRACUDA_VERBOSE
            _modelParamLoader = BarracudaModelParamLoader.GetLoaderAndCheck(_engine, _barracudaModel, brainParameters);
            _inferenceInputs = _modelParamLoader.GetInputTensors();
            _outputNames = _modelParamLoader.GetOutputNames();
-            _tensorGenerator = new TensorGenerator(brainParameters, seed, _barracudaModel);
-            _tensorApplier = new TensorApplier(brainParameters, seed, _barracudaModel);
+            _tensorGenerator = new TensorGenerator(brainParameters, seed, _tensorAllocator, _barracudaModel);
+            _tensorApplier = new TensorApplier(brainParameters, seed, _tensorAllocator, _barracudaModel);
 #endif
        }
        

 #if ENABLE_TENSORFLOW
            return (_modelParamLoader != null) ? _modelParamLoader.GetChecks() : new List<string>();
-#elif ENABLE_BARRACUDA
+#else
-#else
-            return new List<string>(){
-                "You need to install the TensorflowSharp plugin and add the ENABLE_TENSORFLOW " +
-                "flag in your Player Settings in order to use inference. "};
 #endif
        }

            {
                return;
            }
+            
+            Profiler.BeginSample("LearningBrain.DecideAction");
+            
 #if ENABLE_TENSORFLOW
            if (_engine == null)
            {

            // Update the outputs
            _tensorApplier.ApplyTensors(_inferenceOutputs, agentInfos);
-#elif ENABLE_BARRACUDA
+#else
            if (_engine == null)
            {
                Debug.LogError($"No model was present for the Brain {name}.");
+            Profiler.BeginSample($"MLAgents.{name}.GenerateTensors");
+            Profiler.EndSample();
+            Profiler.BeginSample($"MLAgents.{name}.PrepareBarracudaInputs");
+            Profiler.EndSample();

            // Execute the Model
            Profiler.BeginSample($"MLAgents.{name}.ExecuteGraph");
+            Profiler.BeginSample($"MLAgents.{name}.FetchBarracudaOutputs");
-            CleanupBarracudaState(inputs);
+            Profiler.EndSample();
+            Profiler.BeginSample($"MLAgents.{name}.ApplyTensors");
-#else
-            if (agentInfos.Count > 0)
-            {
-                Debug.LogError(string.Format(
-                    "The brain {0} was set to inference mode but the Tensorflow library is not " +
-                    "present in the Unity project.",
-                    name));
-            }
+            Profiler.EndSample();
+            Profiler.EndSample();
-#if ENABLE_BARRACUDA && !ENABLE_TENSORFLOW
-        protected Dictionary<string, Barracuda.Tensor> PrepareBarracudaInputs(IEnumerable<Tensor> infInputs)
+#if !ENABLE_TENSORFLOW
+        protected Dictionary<string, Tensor> PrepareBarracudaInputs(IEnumerable<TensorProxy> infInputs)
-            var inputs = new Dictionary<string, Barracuda.Tensor>();
+            var inputs = new Dictionary<string, Tensor>();
-                inputs[inp.Name] = BarracudaUtils.ToBarracuda(inp);
+                inputs[inp.Name] = inp.Data;
-        protected List<Tensor> FetchBarracudaOutputs(string[] names)
+        protected List<TensorProxy> FetchBarracudaOutputs(string[] names)
-            var outputs = new List<Tensor>();
+            var outputs = new List<TensorProxy>();
-                outputs.Add(BarracudaUtils.FromBarracuda(outp, name));
+                outputs.Add(TensorUtils.TensorProxyFromBarracuda(outp, name));
-
-        protected void CleanupBarracudaState(Dictionary<string, Barracuda.Tensor> inputs)
-        {
-            foreach (var key in inputs.Keys)
-            {
-                inputs[key].Dispose();
-            }
-            inputs.Clear();
-        }
-
+#endif
+        
+#if !ENABLE_TENSORFLOW
+#endif
+            _tensorAllocator?.Reset(false);
-#endif
+
    }
 }
--- a/UnitySDK/Assets/ML-Agents/Scripts/Utilities.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/Utilities.cs
 using UnityEngine;
 using System.Collections;
 using System.Collections.Generic;
+using Barracuda;
+using MLAgents.InferenceBrain;

 namespace MLAgents
 {
        /// Converts a list of Texture2D into a Tensor.
        /// </summary>
-        /// <returns>
-        /// A 4 dimensional float Tensor of dimension
-        /// [batch_size, height, width, channel].
-        /// Where batch_size is the number of input textures,
-        /// height corresponds to the height of the texture,
-        /// width corresponds to the width of the texture,
-        /// channel corresponds to the number of channels extracted from the
-        /// input textures (based on the input blackAndWhite flag
-        /// (3 if the flag is false, 1 otherwise).
-        /// The values of the Tensor are between 0 and 1.
-        /// </returns>
+        /// <param name="tensorProxy">
+        /// Tensor proxy to fill with Texture data.
+        /// </param>
        /// <param name="textures">
        /// The list of textures to be put into the tensor.
        /// Note that the textures must have same width and height.
        /// will be converted to grayscale before being stored in the tensor.
        /// </param>
-        public static float[,,,] TextureToFloatArray(List<Texture2D> textures, bool blackAndWhite)
+        /// <param name="allocator">Tensor allocator</param>
+        public static void TextureToTensorProxy(TensorProxy tensorProxy, List<Texture2D> textures, bool blackAndWhite, 
+                                                                ITensorAllocator allocator)
-            var pixels = blackAndWhite ? 1 : 3;
-            var result = new float[batchSize, height, width, pixels];
+            var data = tensorProxy.Data;

            for (var b = 0; b < batchSize; b++)
            {
                        {
                            // For Color32, the r, g and b values are between
                            // 0 and 255.
-                            result[b, h, w, 0] = currentPixel.r / 255.0f;
-                            result[b, h, w, 1] = currentPixel.g / 255.0f;
-                            result[b, h, w,2] = currentPixel.b / 255.0f;
+                            data[b, h, w, 0] = currentPixel.r / 255.0f;
+                            data[b, h, w, 1] = currentPixel.g / 255.0f;
+                            data[b, h, w,2] = currentPixel.b / 255.0f;
-                            result[b, h, w, 0] = (currentPixel.r + currentPixel.g + currentPixel.b)
+                            data[b, h, w, 0] = (currentPixel.r + currentPixel.g + currentPixel.b)
-            return result;
        }
        
        
                result[actionIndex + 1] = runningSum;
            }
            return result;
+        }
+
+        /// <summary>
+        /// Moves every element of the list <paramref name="amount"/> positions towards the front.
+        /// The list keeps its length; the trailing <paramref name="amount"/> slots are not written
+        /// and therefore retain their previous values.
+        /// </summary>
+        /// <param name="list">
+        /// Target list
+        /// </param>
+        /// <param name="amount">
+        /// Shift amount
+        /// </param>
+        public static void ShiftLeft<T>(List<T> list, int amount)
+        {
+            // "to" trails "from" by exactly "amount" positions for the whole walk.
+            for (int to = 0, from = amount; from < list.Count; to++, from++)
+            {
+                list[to] = list[from];
+            }
+        }
+
+        /// <summary>
+        /// Overwrites elements of <paramref name="dst"/> with the elements of <paramref name="src"/>,
+        /// writing the first source element at index <paramref name="start"/> and continuing from there.
+        /// The target list is never grown; elements are assigned through its indexer.
+        /// </summary>
+        /// <param name="dst">
+        /// Target list
+        /// </param>
+        /// <param name="src">
+        /// Source list
+        /// </param>
+        /// <param name="start">
+        /// Offset in target list
+        /// </param>
+        public static void ReplaceRange<T>(List<T> dst, List<T> src, int start)
+        {
+            var copied = 0;
+            while (copied < src.Count)
+            {
+                dst[start + copied] = src[copied];
+                copied++;
+            }
+        }
+
+        
+        /// <summary>
+        /// Appends every element of <paramref name="src"/> to <paramref name="dst"/> one at a time,
+        /// without extra temp allocations (assuming it fits pre-allocated capacity of the list).
+        /// Written as an alternative to <c>List&lt;T&gt;.AddRange()</c>, which according to the
+        /// discussion linked below allocates a temporary list to add items:
+        /// https://stackoverflow.com/questions/2123161/listt-addrange-implementation-suboptimal
+        /// Note: this implementation might be slow with large numbers of elements in the source array.
+        /// </summary>
+        /// <param name="dst">
+        /// Target list
+        /// </param>
+        /// <param name="src">
+        /// Source array
+        /// </param>
+        /// <exception cref="System.ArgumentNullException">
+        /// Thrown when <paramref name="dst"/> or <paramref name="src"/> is null.
+        /// </exception>
+        public static void AddRangeNoAlloc<T>(List<T> dst, T[] src)
+        {
+            // Fail fast with a descriptive exception instead of a bare NullReferenceException.
+            if (dst == null)
+            {
+                throw new System.ArgumentNullException(nameof(dst));
+            }
+
+            if (src == null)
+            {
+                throw new System.ArgumentNullException(nameof(src));
+            }
+
+            // Element-wise Add: no intermediate collection is created by this method itself.
+            for (var i = 0; i < src.Length; i++)
+            {
+                dst.Add(src[i]);
+            }
        }
    }
 }
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using Barracuda;
+using UnityEngine;
+
+namespace MLAgents.InferenceBrain
+{
+
+	/// <summary>
+	/// TensorProxy - describes a single named tensor used for inference.
+	///
+	/// An instance carries the tensor's name, its declared element type, its shape and the
+	/// Barracuda <see cref="Tensor"/> that holds the values. The class is marked serializable
+	/// and exposes its members as public fields.
+	/// </summary>
+	[System.Serializable]
+	public class TensorProxy
+	{
+		/// <summary>Element kinds a tensor can be declared with.</summary>
+		public enum TensorType
+		{
+			Integer,
+			FloatingPoint
+		}
+
+		// Lookup from the declared element kind to the matching CLR type.
+		private static Dictionary<TensorType, Type> m_typeMap = new Dictionary<TensorType, Type>
+		{
+			[TensorType.FloatingPoint] = typeof(float),
+			[TensorType.Integer] = typeof(int)
+		};
+
+		/// <summary>Name identifying the tensor.</summary>
+		public string Name;
+
+		/// <summary>Declared element kind; <see cref="DataType"/> is derived from it.</summary>
+		public TensorType ValueType;
+
+		/// <summary>
+		/// CLR type matching <see cref="ValueType"/>. It is looked up on every access rather than
+		/// stored: the enum is the field that is kept, not the <see cref="Type"/>.
+		/// </summary>
+		public Type DataType => m_typeMap[ValueType];
+
+		/// <summary>Dimensions of the tensor; index 0 is treated as the batch size by TensorUtils.</summary>
+		public long[] Shape;
+
+		/// <summary>Barracuda tensor holding the values; may be null until one is assigned.</summary>
+		public Tensor Data;
+	}
+	
+	/// <summary>
+	/// Helpers for resizing <see cref="TensorProxy"/> instances and for converting between
+	/// Barracuda tensors and multi-dimensional managed arrays.
+	/// </summary>
+	public class TensorUtils
+	{
+		/// <summary>
+		/// Makes sure <paramref name="tensor"/> uses <paramref name="batch"/> as its first dimension,
+		/// replacing its backing data with a newly allocated tensor when it does not.
+		/// </summary>
+		/// <param name="tensor">Proxy to resize; <c>Shape[0]</c> and <c>Data</c> may be overwritten.</param>
+		/// <param name="batch">Requested batch size.</param>
+		/// <param name="allocator">Allocator asked for the replacement tensor.</param>
+		public static void ResizeTensor(TensorProxy tensor, int batch, ITensorAllocator allocator)
+		{
+			var dims = tensor.Shape;
+			var backing = tensor.Data;
+
+			// Nothing to do when both the recorded shape and the backing tensor already match.
+			if (dims[0] == batch && backing != null && backing.batch == batch)
+			{
+				return;
+			}
+
+			// The previous backing tensor (if any) is disposed before a new one is requested.
+			backing?.Dispose();
+			dims[0] = batch;
+
+			// Rank-4 shapes keep their three trailing dimensions; any other rank is allocated as
+			// (batch, last dimension).
+			var target = dims.Length == 4
+				? new TensorShape(batch, (int)dims[1], (int)dims[2], (int)dims[3])
+				: new TensorShape(batch, (int)dims[dims.Length - 1]);
+			tensor.Data = allocator.Alloc(target);
+		}
+
+		/// <summary>
+		/// Copies the content of a Barracuda tensor into a new multi-dimensional float array.
+		/// </summary>
+		/// <param name="tensor">Tensor to read from.</param>
+		/// <returns>
+		/// A <c>float[batch, channels]</c> array when height and width are both 1, otherwise a
+		/// <c>float[batch, height, width, channels]</c> array.
+		/// </returns>
+		public static Array BarracudaToFloatArray(Tensor tensor)
+		{
+			var isFlat = tensor.height == 1 && tensor.width == 1;
+			var result = isFlat
+				? (Array)new float[tensor.batch, tensor.channels]
+				: new float[tensor.batch, tensor.height, tensor.width, tensor.channels];
+
+			// Buffer.BlockCopy counts in bytes, hence the multiplication by the size of a float.
+			// NOTE(review): the copy always starts at byte 0 of readonlyArray - confirm the tensor
+			// data is never stored at a non-zero offset inside that array.
+			Buffer.BlockCopy(tensor.readonlyArray, 0, result, 0, tensor.length * sizeof(float));
+
+			return result;
+		}
+
+		/// <summary>
+		/// Copies the content of a Barracuda tensor into a new multi-dimensional int array,
+		/// casting each element to <c>int</c>.
+		/// </summary>
+		/// <param name="tensor">Tensor to read from.</param>
+		/// <returns>
+		/// An <c>int[batch, channels]</c> array when height and width are both 1, otherwise an
+		/// <c>int[batch, height, width, channels]</c> array.
+		/// </returns>
+		public static Array BarracudaToIntArray(Tensor tensor)
+		{
+			var batch = tensor.batch;
+			var channels = tensor.channels;
+
+			if (tensor.height == 1 && tensor.width == 1)
+			{
+				var flat = new int[batch, channels];
+
+				for (var b = 0; b < batch; b++)
+				{
+					for (var c = 0; c < channels; c++)
+					{
+						flat[b, c] = (int)tensor[b, c];
+					}
+				}
+
+				return flat;
+			}
+
+			var height = tensor.height;
+			var width = tensor.width;
+			var full = new int[batch, height, width, channels];
+
+			for (var b = 0; b < batch; b++)
+			{
+				for (var y = 0; y < height; y++)
+				{
+					for (var x = 0; x < width; x++)
+					{
+						for (var c = 0; c < channels; c++)
+						{
+							full[b, y, x, c] = (int)tensor[b, y, x, c];
+						}
+					}
+				}
+			}
+
+			return full;
+		}
+
+		/// <summary>
+		/// Creates a Barracuda tensor with the dimensions of <paramref name="array"/> and copies
+		/// the array content into it.
+		/// </summary>
+		/// <param name="array">
+		/// Source array. Rank 2 produces a (dim0, dim1) tensor; every other rank is read as four
+		/// dimensions. NOTE(review): the byte count of the copy assumes 4-byte elements - confirm
+		/// callers only pass float arrays.
+		/// </param>
+		/// <returns>The newly created tensor.</returns>
+		public static Tensor ArrayToBarracuda(Array array)
+		{
+			Tensor created;
+
+			if (array.Rank != 2)
+			{
+				created = new Tensor(array.GetLength(0), array.GetLength(1), array.GetLength(2), array.GetLength(3));
+			}
+			else
+			{
+				created = new Tensor(array.GetLength(0), array.GetLength(1));
+			}
+
+			var dstOffset = 0;
+			var dstBuffer = created.data != null ? created.tensorOnDevice.SharedAccess(out dstOffset) : null;
+
+			// NOTE(review): Buffer.BlockCopy takes its destination offset in bytes and rejects a
+			// null destination. dstOffset is passed through unscaled and dstBuffer is null when
+			// created.data is null - confirm neither case can occur for a freshly built tensor.
+			Buffer.BlockCopy(array, 0, dstBuffer, dstOffset, created.length * sizeof(float));
+
+			return created;
+		}
+
+		/// <summary>
+		/// Converts a Barracuda shape into the <c>long[]</c> layout stored in <see cref="TensorProxy.Shape"/>.
+		/// </summary>
+		/// <param name="src">Shape to convert.</param>
+		/// <returns>
+		/// <c>{batch, channels}</c> when height and width are both 1, otherwise
+		/// <c>{batch, height, width, channels}</c>.
+		/// </returns>
+		internal static long[] TensorShapeFromBarracuda(TensorShape src)
+		{
+			var isFlat = src.height == 1 && src.width == 1;
+
+			return isFlat
+				? new long[] {src.batch, src.channels}
+				: new long[] {src.batch, src.height, src.width, src.channels};
+		}
+
+		/// <summary>
+		/// Wraps a Barracuda tensor in a floating-point <see cref="TensorProxy"/>. The tensor is
+		/// referenced by the proxy, not copied.
+		/// </summary>
+		/// <param name="src">Tensor to wrap.</param>
+		/// <param name="nameOverride">Name for the proxy; the tensor's own name is used when null.</param>
+		/// <returns>The new proxy.</returns>
+		public static TensorProxy TensorProxyFromBarracuda(Tensor src, string nameOverride = null)
+		{
+			var dims = TensorShapeFromBarracuda(src.shape);
+
+			var proxy = new TensorProxy();
+			proxy.Name = nameOverride ?? src.name;
+			proxy.ValueType = TensorProxy.TensorType.FloatingPoint;
+			proxy.Shape = dims;
+			proxy.Data = src;
+			return proxy;
+		}
+	}
+
+}
--- a/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Tensor.cs
+++ b/UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/Tensor.cs
-using System;
-using System.Collections.Generic;
-
-namespace MLAgents.InferenceBrain
-{
-
-	/// <summary>
-	/// Tensor - A class to encapsulate a Tensor used for inference.
-	/// 
-	/// This class contains the Array that holds the data array, the shapes, type and the placeholder in the
-	/// execution graph. All the fields are editable in the inspector, allowing the user to specify everything
-	/// but the data in a graphical way.
-	/// </summary>
-	[System.Serializable]
-	public class Tensor
-	{
-		public enum TensorType
-		{
-			Integer,
-			FloatingPoint
-		};
-
-		private static Dictionary<TensorType, Type> m_typeMap = new Dictionary<TensorType, Type>()
-		{
-			{ TensorType.FloatingPoint, typeof(float)},
-			{TensorType.Integer, typeof(int)}
-		};
-
-		public string Name;
-		public TensorType ValueType;
-		// Since Type is not serializable, we use the DisplayType for the Inspector
-		public Type DataType
-		{
-			get { return m_typeMap[ValueType]; }
-		}
-		public long[] Shape;
-		public Array Data;
-	}
-
-}
--- a//UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs.meta
+++ b//UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorProxy.cs.meta