
First stage of ML Agents update to Barracuda 0.2.x

/develop-generalizationTraining-TrainerController
Mantas Puida 6 years ago
Current commit
27567062
55 files changed, with 4,646 additions and 3,834 deletions
  1. UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta (2)
  2. UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta (2)
  3. UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta (2)
  4. UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta (2)
  5. UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta (2)
  6. UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta (2)
  7. UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta (2)
  8. UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta (2)
  9. UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta (2)
  10. UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta (2)
  11. UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta (2)
  12. UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta (2)
  13. UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat (5)
  14. UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat (5)
  15. UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta (2)
  16. UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta (2)
  17. UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta (2)
  18. UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta (2)
  19. UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta (2)
  20. UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta (2)
  21. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md (23)
  22. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll (997)
  23. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute (918)
  24. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute (944)
  25. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute (68)
  26. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute (566)
  27. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute (632)
  28. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute (438)
  29. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute (30)
  30. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute (944)
  31. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute (214)
  32. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute (483)
  33. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc (44)
  34. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc (480)
  35. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute (112)
  36. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md (23)
  37. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json (2)
  38. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs (2)
  39. UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs (7)
  40. ml-agents/mlagents/trainers/barracuda.py (354)
  41. ml-agents/mlagents/trainers/tensorflow_to_barracuda.py (914)
  42. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta (2)
  43. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta (8)
  44. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta (8)
  45. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef (8)
  46. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta (7)
  47. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png (8)
  48. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta (106)
  49. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs (42)
  50. UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs (29)
  51. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs (10)
  52. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta (11)
  53. /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta (0)

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.10980392, g: 0.6039216, b: 1, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.5, g: 0.5, b: 0.5, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

23
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md


### Load Model into Barracuda
Once you have converted your TensorFlow (or ONNX) model, you can load the resulting Barracuda file via `ModelLoader`:
```C#
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".bytes");
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".nn");
```
Another option is to use the editor model importer. Just add a public `NNModel` field to your C# script and assign the ``.nn`` model file via the editor UI:
```C#
public NNModel modelSource;
<..>
var model = ModelLoader.Load(modelSource);
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputeFast, model)
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputePrecompiled, model)
```
### Execute the model

Execution is asynchronous for GPU backends. The current implementation is synchronous for CPU backends; however, it is good to assume that execution will be asynchronous for all backends in the future.
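A minimal sketch of a single inference step under these semantics; the `Execute` overloads, the tensor shape, and the input name below are assumptions based on how the ML-Agents inference code drives the worker, not shown in this excerpt:
```C#
// Hypothetical input tensor created and owned by the caller; shape is a placeholder.
var input = new Tensor(1, observationSize);
worker.Execute(input);   // asynchronous on GPU backends, synchronous on CPU today
// Multi-input models take a named dictionary instead, e.g.:
// worker.Execute(new Dictionary<string, Tensor> { { "vector_observation", input } });
```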
### Fetch outputs
If the model has only a single output, then a simple `worker.Fetch()` can be used; otherwise output names should be provided.
If the model has only a single output, then a simple `worker.Peek()` can be used; otherwise output names should be provided.
var O = worker.Fetch(outputName);
var O = worker.Peek(outputName);
_Note:_ ``Peek()`` does not take ownership of the tensor. If you expect to keep the tensor for a longer time, use ``Fetch()``.
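A short sketch of that ownership difference (names are illustrative only):
```C#
var peeked = worker.Peek(outputName);   // worker still owns this tensor; do not Dispose it
// ... read values while the worker holds the result ...

var fetched = worker.Fetch(outputName); // ownership transfers to the caller
// ... safe to keep this tensor around for longer ...
fetched.Dispose();                      // caller must dispose fetched tensors
```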
### Cleanup
As a Barracuda client you are responsible for calling `Dispose` on the _worker_, the _inputs_, and any _outputs_ you fetched. This is necessary to properly free GPU resources.
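For example, a minimal teardown might look like this (assuming `input` was created by the caller and `output` was obtained with `Fetch()`):
```C#
output.Dispose();   // tensors fetched with Fetch() are owned by the caller
input.Dispose();    // inputs created by the caller must also be disposed
worker.Dispose();   // releases compute buffers and other GPU resources
```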

### Texture as output
If you want to use Barracuda execution results further in the graphics pipeline, you can copy data from `Tensor` into `RenderTexture` without stalling CPU or GPU:
```C#
var tensor = worker.Fetch();
var tensor = worker.Peek();
var texture = BarracudaTextureUtils.TensorToRenderTexture(tensor);
```
If you wish, you can reuse the same `RenderTexture` multiple times:
```C#
var tensor = worker.Fetch();
var tensor = worker.Peek();
BarracudaTextureUtils.TensorToRenderTexture(tensor, texture);
```
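As a usage note, the reused `texture` above has to be allocated once by the caller; a plausible setup (dimensions are placeholders that should match the tensor) would be:
```C#
// Allocated once, e.g. in Start(); the third argument 0 means no depth buffer.
var texture = new RenderTexture(width, height, 0);
```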

Convert from TensorFlow:
```bash
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.bytes
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.bytes
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.nn
```
If the network has multiple outputs, but you only need particular ones during inference, there is an optional `-trim` flag to remove unused outputs and calculations.
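A hypothetical invocation is sketched below; the exact argument form of `-trim` is not shown in this excerpt, so the placeholder presumably names the output(s) to keep:
```bash
# Trim the graph to what is needed for the named output(s) before conversion.
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn -trim <output-node-name>
```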

997
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
File diff too large to display

918
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute


#pragma kernel Relu_Flat
#pragma kernel Relu_Loop
#pragma kernel Relu6_Flat
#pragma kernel Relu6_Loop
#pragma kernel Tanh_Flat
#pragma kernel Tanh_Loop
#pragma kernel Swish_Flat
#pragma kernel Swish_Loop
#pragma kernel Sigmoid_Flat
#pragma kernel Sigmoid_Loop
#pragma kernel Elu_Flat
#pragma kernel Elu_Loop
#pragma kernel LeakyRelu_Flat
#pragma kernel LeakyRelu_Loop
#pragma kernel Exp_Flat
#pragma kernel Exp_Loop
#pragma kernel Log_Flat
#pragma kernel Log_Loop
#pragma kernel Pow_Flat
#pragma kernel Pow_Loop
/*
Relu_Flat (NEW) vs Relu_Nyxc+Relu_CNyx+Relu
Compute Precompiled
VGG@1
<<<Exec #128: 59.6 ms, cpu: .9 ms, avg: 62.4 ms, result:OK <--- NEW!
<<<Exec #128: 63.6 ms, cpu: .9 ms, avg: 64.0 ms, result:OK
VGG@4
<<<Exec #16: 276.7 ms, cpu: .9 ms, avg: 272.8 ms, result:OK <--- NEW!
<<<Exec #16: 297.5 ms, cpu: .9 ms, avg: 274.4 ms, result:OK
RES@1
<<<Exec #100: 82.2 ms, cpu: 22.2 ms, avg: 81.0 ms, result:OK <--- NEW!
<<<Exec #100: 82.1 ms, cpu: 22.5 ms, avg: 85.4 ms, result:OK
PPO_2@256
<<<Exec #200: 10.3 ms, cpu: 7.6 ms, avg: 11.9 ms, result:OK <--- NEW!
<<<Exec #200: 10.9 ms, cpu: 8.3 ms, avg: 12.3 ms, result:OK
PPO_CNN@256
<<<Exec #100: 60.6 ms, cpu: 62.3 ms, avg: 65.6 ms, result:OK <--- NEW!
<<<Exec #100: 72.6 ms, cpu: 62.7 ms, avg: 66.0 ms, result:OK
*/
#pragma kernel Relu
#pragma kernel Relu_CNyx
#pragma kernel Relu_Nyxc

#pragma kernel Exp
#pragma kernel Exp_CNyx
#pragma kernel Exp_Nyxc
#pragma kernel Log
#pragma kernel Log_CNyx
#pragma kernel Log_Nyxc
#pragma kernel Pow
#pragma kernel Pow_CNyx
#pragma kernel Pow_Nyxc

TENSOR_DECL_RW(O)
float _Alpha;
uint _LoopStride;
#define FLAT_ACTIVATION(name, op_name) \
void name##_Flat (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
if (i > O.GetLength()) return;\
\
float v = X.Get(i);\
v = op_name (v);\
O.Set(i, v);\
}
#define LOOP_ACTIVATION(name, op_name) \
void name##_Loop (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
uint len = O.GetLength();\
\
while (i < len) {\
float v = X.Get(i); \
v = op_name (v); \
O.Set(i, v); \
i += _LoopStride; \
}\
}
#define ACTIVATION(name, op_name) \
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
FLAT_ACTIVATION(name, op_name)\
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
LOOP_ACTIVATION(name, op_name)
return 0.5f * (v + abs(v));
return 0.5f * (v + abs(v));
return min(max(0, v), 6);
return min(max(0, v), 6);
return v / (1.f + exp(-v));
return v / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
return max(v, _Alpha * v);
return max(v, _Alpha * v);
float signed_pow(float f, float e)
float signed_pow(float f)
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
float e = _Alpha;
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
ACTIVATION(Relu, relu)
ACTIVATION(Relu6, relu6)
ACTIVATION(Tanh, tanh)
ACTIVATION(Sigmoid, sigmoid)
ACTIVATION(Swish, swish)
ACTIVATION(Elu, elu)
ACTIVATION(LeakyRelu, lrelu)
ACTIVATION(Exp, exp)
ACTIVATION(Log, log)
ACTIVATION(Pow, signed_pow)
// -------------------
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Log(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Log_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
}
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Log_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}

DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
}

944
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute
File diff too large to display

68
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute


NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastAdd(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastSub(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMul(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastDiv(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

float signed_pow(float f, float e)
{
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMin(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMax(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

566
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute


#pragma kernel Conv2D
#pragma kernel Conv2D_RegisterBlock4x2
//#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel DepthwiseConv2D

NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
void Conv2D(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
for (uint c = 0; c < X.channels; ++c)
acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
}
}
for (uint c = 0; c < X.channels; ++c)
acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
}
}
O.Set(n, y, x, k, acc);
}
O.Set(n, y, x, k, acc);
}
}

void Conv2D_RegisterBlock4x2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x*SIZE_W >= O.width) return;
if (y*SIZE_H >= O.height) return;
if (k >= K.channels) return;
if (x*SIZE_W >= O.width) return;
if (y*SIZE_H >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
acc[q] = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
acc[q] = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);
for (uint c = 0; c < X.channels; ++c)
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
}
}
for (uint c = 0; c < X.channels; ++c)
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
}
}
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
}
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
}
}
#undef SIZE_W
#undef SIZE_H

[numthreads(L1CACHESIZE, 1, 1)]
void Conv2D_L1Cached64_RegisterBlock4x4(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv2D_L1Cached64_Reg_Loop_safe_X
#define X_ Conv2D_L1Cached64_Reg_Loop_safe_X
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= O.width) return;
if (y*SIZE >= O.height) return;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= O.width) return;
if (y*SIZE >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
GroupMemoryBarrierWithGroupSync();
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
GroupMemoryBarrierWithGroupSync();
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE; ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE && (c + dc) < K.GetKernelDepth(); ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
uint remainderW = (O.width - x*SIZE);
uint remainderH = (O.height - y*SIZE);
uint remainderW = (O.width - x*SIZE);
uint remainderH = (O.height - y*SIZE);
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
if (q/SIZE < remainderH && q%SIZE < remainderW)
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
}
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
if (q/SIZE < remainderH && q%SIZE < remainderW)
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
}
#undef X_
#undef X_
}

DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
acc = fastfma(
X.Get(n, pos, k),
K.Get(dy, dx, 0, k),
acc);
}
acc = fastfma(
X.Get(n, pos, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
O.Set(n, y, x, k, acc);
}
}
}

{
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width, X.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width, X.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
if (k >= K.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
uint2 pad = _Pad.xy / _Stride.xy;
uint2 leftCorner = pad;
uint2 rightCorner = uint2(X.width, X.height) + pad;
uint2 pad = _Pad.xy / _Stride.xy;
uint2 leftCorner = pad;
uint2 rightCorner = uint2(X.width, X.height) + pad;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc = B.Get(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc = B.Get(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy;
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
for (uint c = 0; c < X.channels; ++c)
{
acc = fastfma( X.Get(n, pos - leftCorner, c),
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c, k),
acc);
}
}
}
for (uint c = 0; c < X.channels; ++c)
{
acc = fastfma( X.Get(n, pos - leftCorner, c),
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c, k),
acc);
}
}
}
uint oy = y * _Stride.y + sy;
uint ox = x * _Stride.x + sx;
if (oy < O.height && ox < O.width)
O.Set(n, oy, ox, k, acc);
}
}
}
uint oy = y * _Stride.y + sy;
uint ox = x * _Stride.x + sx;
if (oy < O.height && ox < O.width)
O.Set(n, oy, ox, k, acc);
}
}
}
}
#undef L1CACHESIZE

[numthreads(L1CACHESIZE, 1, 1)]
void Conv2DTrans_L1Cached64_RegisterBlock2x2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width / SIZE, X.height / SIZE);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv2DTrans_L1Cached64_Reg_Loop_safe_X
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= X.width) return;
if (y*SIZE >= X.height) return;
uint2 pad = _Pad.xy / _Stride.xy;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) + uint2(dx+sx, dy+sy) / _Stride.xy;
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, pad);
GroupMemoryBarrierWithGroupSync();
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
//uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE; ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma( X_[q][dc],
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c + dc, k),
acc[q]);
//kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
{
uint ox = (x*SIZE+(q%SIZE)) * _Stride.x + sx;
uint oy = (y*SIZE+(q/SIZE)) * _Stride.y + sy;
if (ox < O.width && oy < O.height)
O.Set(n, oy, ox, k, acc[q]);
}
}
}
}
#undef X_
}

632
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute


NUMTHREADS((16,8,1), (16,8,1), (16,4,1))
void Conv2D_Kmod16_Nmod8_KNY(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.channels, O.batch, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint n = dispatchThreadID.y;
uint y = dispatchThreadID.z;
for (uint x = 0; x < O.width; ++x)
{
float v = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) continue;
if (oy - _Pad.w >= X.height) continue;
if (ox < _Pad.x) continue;
if (ox - _Pad.z >= X.width) continue;
for (uint c = 0; c < X.channels; ++c)
{
v += X.Get(n, oy-_Pad.y, ox-_Pad.x, c) * K.Get(dy, dx, c, k);
}
}
}
O.Set(n, y, x, k, v);
}
}
#undef CTILE

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod32_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_Xcache
#define K_ Conv_Kcache
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint width = O.width;
uint height = O.height;
uint x = nyx % width;
uint ny = nyx / width;
uint y = ny % height;
uint n = ny / height;
float b0 = B.Get(k*2+0);
float b1 = B.Get(k*2+1);
float4 v = float4(b0, b1,
b0, b1);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
bool mask = true;
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) mask = false;
if (oy - _Pad.w >= X.height) mask = false;
if (ox < _Pad.x) mask = false;
if (ox - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/(CTILE*2); ++m)
{
float x0 = 0;
float x1 = 0;
float x2 = 0;
float x3 = 0;
if (mask)
{
x0 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0);
x1 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1);
x2 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0);
x3 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1);
}
float k0 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0);
float k1 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1);
float k2 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0);
float k3 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1);
//X_[gy][gx] = float4(x0, x1,
// x2, x3);
//K_[gy][gx] = float4(k0, k1,
// k2, k3);
X_[0][gy][gx] = x0;
X_[1][gy][gx] = x1;
X_[2][gy][gx] = x2;
X_[3][gy][gx] = x3;
K_[0][gy][gx] = k0;
K_[1][gy][gx] = k1;
K_[2][gy][gx] = k2;
K_[3][gy][gx] = k3;
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < CTILE; ++i)
{
float4 x = //X_[gy][i];
float4( X_[0][gy][i],
X_[1][gy][i],
X_[2][gy][i],
X_[3][gy][i]);
float4 k = //K_[i][gx];
float4( K_[0][i][gx],
K_[1][i][gx],
K_[2][i][gx],
K_[3][i][gx]);
v.x = mad(k.x, x.x, v.x);
v.x = mad(k.z, x.y, v.x);
v.y = mad(k.y, x.x, v.y);
v.y = mad(k.w, x.y, v.y);
v.z = mad(k.x, x.z, v.z);
v.z = mad(k.z, x.w, v.z);
v.w = mad(k.y, x.z, v.w);
v.w = mad(k.w, x.w, v.w);
//v.x += k.x*x.x + k.z*x.y;
//v.y += k.y*x.x + k.w*x.y;
//v.z += k.x*x.z + k.z*x.w;
//v.w += k.y*x.z + k.w*x.w;
}
GroupMemoryBarrierWithGroupSync();
}
}
}
O.Set(n*2+0, y, x, k*2+0, v.x);
O.Set(n*2+0, y, x, k*2+1, v.y);
O.Set(n*2+1, y, x, k*2+0, v.z);
O.Set(n*2+1, y, x, k*2+1, v.w);
#undef X_
#undef K_
}
#undef CTILE
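Each thread in Conv2D_Cache_KCmod32_KNyx accumulates a 2x2 register block: two consecutive output channels (k*2+0, k*2+1) for two consecutive batch images (n*2+0, n*2+1), seeded from the bias pair in the float4 v. The CTILE x CTILE groupshared tiles of X and K are filled cooperatively once per channel chunk and then reused CTILE times by every thread, the standard shared-memory plus register-blocking layout for GEMM-style convolution; the KNyxDiv2 variant below applies the same idea but packs two neighbouring output columns instead of two batch images.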

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod32_KNyxDiv2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_Xcache2
#define K_ Conv_Kcache2
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint width = O.width / 2;
uint height = O.height;
uint x = nyx % width;
uint ny = nyx / width;
uint y = ny % height;
uint n = ny / height;
float b0 = B.Get(k*2+0);
float b1 = B.Get(k*2+1);
float4 v = float4(b0, b1,
b0, b1);
bool mask = n < O.batch;
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
bool maskY = mask;
uint oy = y * _Stride.y + dy;
if (oy < _Pad.y) maskY = false;
if (oy - _Pad.w >= X.height) maskY = false;
bool maskL = maskY;
uint oxL = (x*2+0) * _Stride.x + dx;
if (oxL < _Pad.x) maskL = false;
if (oxL - _Pad.z >= X.width) maskL = false;
bool maskR = maskY;
uint oxR = (x*2+1) * _Stride.x + dx;
if (oxR < _Pad.x) maskR = false;
if (oxR - _Pad.z >= X.width) maskR = false;
for (uint m = 0; m < X.channels/(CTILE*2); ++m)
{
if (maskL)
{
X_[0][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+0);
X_[1][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+1);
}
else
{
X_[0][gy][gx] = X_[1][gy][gx] = 0;
}
if (maskR)
{
X_[2][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+0);
X_[3][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+1);
}
else
{
X_[2][gy][gx] = X_[3][gy][gx] = 0;
}
K_[0][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0);
K_[1][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1);
K_[2][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0);
K_[3][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1);
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < CTILE; ++i)
{
float4 x =
float4( X_[0][gy][i],
X_[1][gy][i],
X_[2][gy][i],
X_[3][gy][i]);
float4 k =
float4( K_[0][i][gx],
K_[1][i][gx],
K_[2][i][gx],
K_[3][i][gx]);
v.x = mad(k.x, x.x, v.x);
v.x = mad(k.z, x.y, v.x);
v.y = mad(k.y, x.x, v.y);
v.y = mad(k.w, x.y, v.y);
v.z = mad(k.x, x.z, v.z);
v.z = mad(k.z, x.w, v.z);
v.w = mad(k.y, x.z, v.w);
v.w = mad(k.w, x.w, v.w);
}
GroupMemoryBarrierWithGroupSync();
}
}
}
O.Set(n, y, x*2+0, k*2+0, v.x);
O.Set(n, y, x*2+0, k*2+1, v.y);
if (mask && x*2+1 < O.width)
{
O.Set(n, y, x*2+1, k*2+0, v.z);
O.Set(n, y, x*2+1, k*2+1, v.w);
}
#undef X_
#undef K_
}

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 4, O.batch * O.height * O.width / 4, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_XcacheR
#define K_ Conv_KcacheR
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint x = nyx % O.width;
uint ny = nyx / O.width;
uint y = ny % O.height;
uint n = ny / O.height;
float v[RTILE][RTILE];
for (uint xxxx = 0; xxxx < RTILE; ++xxxx)
{
float b = B.Get(k*RTILE+xxxx);
for (uint yyyy = 0; yyyy < RTILE; ++yyyy)
v[yyyy][xxxx] = b;
}
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
bool mask = true;
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) mask = false;
if (oy - _Pad.w >= X.height) mask = false;
if (ox < _Pad.x) mask = false;
if (ox - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/(CTILE*RTILE); ++m)
{
for (uint yy = 0; yy < RTILE; ++yy)
for (uint xx = 0; xx < RTILE; ++xx)
{
if (mask)
X_[yy*RTILE+xx][gy*CTILE+gx] = X.Get(n*RTILE+yy, oy - _Pad.y, ox - _Pad.x, (m*CTILE + gx)*RTILE+xx);
else
X_[yy*RTILE+xx][gy*CTILE+gx] = 0;
K_[yy*RTILE+xx][gy*CTILE+gx] = K.Get(dy, dx, (m*CTILE + gy)*RTILE+yy, k*RTILE+xx);
}
GroupMemoryBarrierWithGroupSync();
for (uint ii = 0; ii < CTILE; ++ii)
{
float x[RTILE][RTILE];
float k[RTILE][RTILE];
[unroll]
for (uint yy = 0; yy < RTILE; ++yy)
{
[unroll]
for (uint xx = 0; xx < RTILE; ++xx)
{
x[yy][xx] = X_[yy*RTILE+xx][gy*CTILE+ii];
k[yy][xx] = K_[yy*RTILE+xx][ii*CTILE+gx];
}
}
[unroll]
for (uint yyy = 0; yyy < RTILE; ++yyy)
{
[unroll]
for (uint xxx = 0; xxx < RTILE; ++xxx)
{
[unroll]
for (uint i = 0; i < RTILE; ++i)
{
v[yyy][xxx] = mad(x[yyy][i], k[i][xxx], v[yyy][xxx]);
}
}
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
for (uint yy = 0; yy < RTILE; ++yy)
for (uint xx = 0; xx < RTILE; ++xx)
O.Set(n*RTILE+yy, y, x, k*RTILE+xx, v[yy][xx]);
#undef X_
#undef K_
}

438
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute


#pragma kernel Dense_L1Cached64
#pragma kernel DenseTiled16x16
//#pragma kernel DenseTiled32x32
//#pragma kernel DenseTiled64x64
#pragma kernel DenseTiled32x32
#pragma kernel DenseTiled64x64
#include "Tensor.cginc"

[numthreads(CACHESIZE, 1, 1)]
void Dense_L1Cached64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ Dense_L1Cached64_X
uint x = CACHESIZE * groupID.x + groupThreadID.x;
uint y = groupID.y;
uint wIndex = W.Index(0, x);
float acc = B.Get(x);
// loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps
for (uint i = 0; i < X.GetFlatWidth(); i += CACHESIZE)
{
// Cache X
// coalescent reads
X_[groupThreadID.x] = X.SafeGet(y, i + groupThreadID.x);
GroupMemoryBarrierWithGroupSync();
// X * W
if (i + CACHESIZE <= X.GetFlatWidth())
{
[unroll]
for (uint di = 0; di < CACHESIZE; ++di)
{
acc = fastfma(X_[di], W.data[wIndex], acc);
wIndex += W.GetFlatWidth();
}
}
else
{
// handle remainder of the line < CACHESIZE
for (uint di = 0; i + di < X.GetFlatWidth(); ++di)
{
acc = fastfma(X_[di], W.data[wIndex], acc);
wIndex += W.GetFlatWidth();
}
}
GroupMemoryBarrierWithGroupSync();
}
// needed all threads to load matrix line, x might be out of the bounds for writing
if (x < O.GetFlatWidth())
O.Set(y, x, acc);
#undef X_
}
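In other words, Dense_L1Cached64 computes O[y][x] = B[x] + sum over j of X[y][j] * W[j][x] with the j axis walked in CACHESIZE-wide chunks: every thread of the group loads one element of the current X chunk into the groupshared X_ array (the coalescent read above), and all CACHESIZE threads then replay that chunk against their own column of W, so each X element is fetched from global memory once per thread group instead of once per output element.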

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled16x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache
#define W_ DenseTiled_Wcache
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
bool mask = (x < O.GetFlatWidth() && y < O.GetFlatHeight());
float v = B.Get(x);
for (uint m = 0; m < X.GetFlatWidth()/TILE_WIDTH; ++m)
{
if (mask)
{
X_[ty][tx] = X.Get(y, m*TILE_WIDTH + tx);
W_[ty][tx] = W.Get(m*TILE_WIDTH + ty, x);
}
else
{
X_[ty][tx] = 0;
W_[ty][tx] = 0;
}
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < TILE_WIDTH; ++i)
{
v = fastfma(X_[ty][i], W_[i][tx], v);
}
GroupMemoryBarrierWithGroupSync();
}
if (mask)
O.Set(y, x, v);
#undef X_
#undef W_
}
#undef TILE_WIDTH
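DenseTiled16x16 is the textbook shared-memory GEMM tiling: each TILE_WIDTH x TILE_WIDTH group stages one tile of X and one tile of W in groupshared arrays, synchronizes, and then reads every staged value TILE_WIDTH times from shared memory rather than from global memory. The 32x32 and 64x64 variants below keep the same structure but additionally give each thread a 2x2 and 4x4 register block of outputs, trading registers for more reuse per loaded value.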

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled32x32(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth / 2, O.flatHeight / 2, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache32
#define W_ DenseTiled_Wcache32
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
float b0 = B.Get(x*2+0);
float b1 = B.Get(x*2+1);
float4 v = float4(b0, b1,
b0, b1);
for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*2);)
{
float x0 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+0);
float x1 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+1);
float x2 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+0);
float x3 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+1);
float w0 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+0);
float w1 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+1);
float w2 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+0);
float w3 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+1);
++m;
X_[0][ty][tx] = x0;
X_[1][ty][tx] = x1;
X_[2][ty][tx] = x2;
X_[3][ty][tx] = x3;
W_[0][ty][tx] = w0;
W_[1][ty][tx] = w1;
W_[2][ty][tx] = w2;
W_[3][ty][tx] = w3;
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < TILE_WIDTH; ++i)
{
float4 x =
float4( X_[0][ty][i],
X_[1][ty][i],
X_[2][ty][i],
X_[3][ty][i]);
float4 w =
float4( W_[0][i][tx],
W_[1][i][tx],
W_[2][i][tx],
W_[3][i][tx]);
v.x = fastfma(w.x, x.x, v.x);
v.y = fastfma(w.y, x.x, v.y);
v.z = fastfma(w.x, x.z, v.z);
v.w = fastfma(w.y, x.z, v.w);
v.x = fastfma(w.z, x.y, v.x);
v.y = fastfma(w.w, x.y, v.y);
v.z = fastfma(w.z, x.w, v.z);
v.w = fastfma(w.w, x.w, v.w);
}
GroupMemoryBarrierWithGroupSync();
}
O.Set(y*2+0, x*2+0, v.x);
O.Set(y*2+0, x*2+1, v.y);
O.Set(y*2+1, x*2+0, v.z);
O.Set(y*2+1, x*2+1, v.w);
#undef X_
#undef W_
}
#undef TILE_WIDTH

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled64x64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth / 4, O.flatHeight / 4, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache64
#define W_ DenseTiled_Wcache64
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
float b0 = B.Get(x*4+0);
float b1 = B.Get(x*4+1);
float b2 = B.Get(x*4+2);
float b3 = B.Get(x*4+3);
float4 v0, v1, v2, v3;
v0 = v1 = v2 = v3 = float4(b0, b1, b2, b3);
for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*4); ++m)
{
for (uint yy = 0; yy < 4; ++yy)
for (uint xx = 0; xx < 4; ++xx)
{
X_[yy*4+xx][ty*TILE_WIDTH+tx] = X.Get(y*4+yy, (m*TILE_WIDTH + tx)*4+xx);
W_[yy*4+xx][ty*TILE_WIDTH+tx] = W.Get((m*TILE_WIDTH + ty)*4+yy, x*4+xx);
}
GroupMemoryBarrierWithGroupSync();
for (uint i = 0; i < TILE_WIDTH; ++i)
{
[unroll]
for (uint q = 0; q < 4; ++q)
{
float x0 = X_[0*4+q][ty*TILE_WIDTH+i];
float x1 = X_[1*4+q][ty*TILE_WIDTH+i];
float x2 = X_[2*4+q][ty*TILE_WIDTH+i];
float x3 = X_[3*4+q][ty*TILE_WIDTH+i];
float w0 = W_[q*4+0][i*TILE_WIDTH+tx];
float w1 = W_[q*4+1][i*TILE_WIDTH+tx];
float w2 = W_[q*4+2][i*TILE_WIDTH+tx];
float w3 = W_[q*4+3][i*TILE_WIDTH+tx];
v0.x = fastfma(x0, w0, v0.x); //--
v1.x = fastfma(x1, w0, v1.x);
v2.x = fastfma(x2, w0, v2.x);
v3.x = fastfma(x3, w0, v3.x);
v0.y = fastfma(x0, w1, v0.y); //--
v1.y = fastfma(x1, w1, v1.y);
v2.y = fastfma(x2, w1, v2.y);
v3.y = fastfma(x3, w1, v3.y);
v0.z = fastfma(x0, w2, v0.z); //--
v1.z = fastfma(x1, w2, v1.z);
v2.z = fastfma(x2, w2, v2.z);
v3.z = fastfma(x3, w2, v3.z);
v0.w = fastfma(x0, w3, v0.w); //--
v1.w = fastfma(x1, w3, v1.w);
v2.w = fastfma(x2, w3, v2.w);
v3.w = fastfma(x3, w3, v3.w);
}
GroupMemoryBarrierWithGroupSync();
}
}
O.Set(y*4+0, x*4+0, v0.x);
O.Set(y*4+0, x*4+1, v0.y);
O.Set(y*4+0, x*4+2, v0.z);
O.Set(y*4+0, x*4+3, v0.w);
O.Set(y*4+1, x*4+0, v1.x);
O.Set(y*4+1, x*4+1, v1.y);
O.Set(y*4+1, x*4+2, v1.z);
O.Set(y*4+1, x*4+3, v1.w);
O.Set(y*4+2, x*4+0, v2.x);
O.Set(y*4+2, x*4+1, v2.y);
O.Set(y*4+2, x*4+2, v2.z);
O.Set(y*4+2, x*4+3, v2.w);
O.Set(y*4+3, x*4+0, v3.x);
O.Set(y*4+3, x*4+1, v3.y);
O.Set(y*4+3, x*4+2, v3.z);
O.Set(y*4+3, x*4+3, v3.w);
#undef X_
#undef W_
}

30
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute


float2 Unpack(SharedTensor t, uint y, uint x)
{
uint v = asuint(t.data[t.Index(y, x) >> 1]);
// TEMPORARY: f16tof32 is broken in GLSL/Metal compiler
// using custom conversion function for now
//return float2(f16tof32(v), f16tof32(v>>16));
return float2(f16tof32_(v), f16tof32_(v>>16));
}
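The FP16 path stores two half-precision values per 32-bit word, so Unpack reads one uint and decodes both halves with the custom f16tof32_ helper (the built-in f16tof32 being unreliable on the GLSL/Metal path, per the comment above). The helper itself is outside this hunk; purely as an illustration of what such a decode involves (an assumption, not the shipped implementation, and ignoring Inf/NaN), a minimal version could look like:

float f16tof32_sketch(uint v)          // half value assumed in the low 16 bits
{
    uint expo = (v >> 10) & 0x1Fu;     // 5 exponent bits, bias 15
    uint mant = v & 0x3FFu;            // 10 mantissa bits
    float s = (v & 0x8000u) ? -1.0 : 1.0;
    if (expo == 0)                     // zero and denormals
        return s * (mant / 1024.0) * exp2(-14.0);
    return s * (1.0 + mant / 1024.0) * exp2((float)expo - 15.0);
}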
// NOTE: usually this path is used for <16 batches

DISPATCH_ARGS(O.flatWidth/2, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
float2 acc = Unpack(B, 0, x*2);
for (uint i = 0; i < X.width; ++i)
{
float2 w = Unpack(W, i, x*2);
acc += X.Get(y, i) * w;
}
O.Set(y, x*2+0, acc[0]);
O.Set(y, x*2+1, acc[1]);
}

944
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute
File diff is too large to display.

214
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute


[numthreads(THREAD_COUNT, 1, 1)]
void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// @TODO: DISPATCH_ARGS(...)
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_XcacheR
#define W_ DenseTiled_WcacheR
uint id = groupThreadID.x;
uint bx = groupID.x;
uint by = groupID.y;
uint bbx = id % BLOCK_WIDTH;
uint bby = id / BLOCK_WIDTH;
float v[BLOCK_WIDTH][BLOCK_WIDTH];
for (uint yy = 0; yy < BLOCK_WIDTH; ++yy)
for (uint xx = 0; xx < BLOCK_WIDTH; ++xx)
{
float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx);
v[yy][xx] = bias;
}
for (uint m = 0; m < X.GetFlatWidth()/LOAD_DEPTH; ++m)
{
for (uint q = 0; q < LOAD_DEPTH; ++q)
{
X_[q][id] = X.Get(by*LOAD_WIDTH + id, m*LOAD_DEPTH + q);
W_[q][id] = W.Get(m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id);
}
GroupMemoryBarrierWithGroupSync();
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
[unroll] for (uint i = 0; i < LOAD_DEPTH; ++i)
{
v[yyy][xxx] = mad(X_[i][bby*BLOCK_WIDTH + yyy], W_[i][bbx*BLOCK_WIDTH + xxx], v[yyy][xxx]);
}
GroupMemoryBarrierWithGroupSync();
}
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, v[yyy][xxx]);
#undef X_
#undef W_
}

[numthreads(THREAD_COUNT, 1, 1)]
void Conv2D_Kernel3x3_64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// @TODO: DISPATCH_ARGS(...)
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_XcacheR
#define K_ Conv_KcacheR
uint id = groupThreadID.x;
uint bx = groupID.x;
uint by = groupID.y;
uint bbx = id % BLOCK_WIDTH;
uint bby = id / BLOCK_WIDTH;
uint width = O.width;
uint height = O.height;
// ASSERT(LOAD_WIDTH == THREAD_COUNT)
uint loadNYX = by*LOAD_WIDTH + id; // only works for 8x8
uint loadX = loadNYX % width;
uint loadNY = loadNYX / width;
uint loadY = loadNY % height;
uint loadN = loadNY / height;
// @TODO: validate that _Stride works, added the following 2 lines without testing
loadX *= _Stride.x;
loadY *= _Stride.y;
float v[BLOCK_WIDTH][BLOCK_WIDTH];
[unroll] for (uint yy = 0; yy < BLOCK_WIDTH; ++yy)
[unroll] for (uint xx = 0; xx < BLOCK_WIDTH; ++xx)
{
float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx);
v[yy][xx] = bias;
}
for (uint dy = 0; dy < 3; ++dy)
{
bool mask = true;
if (loadY+dy < _Pad.y) mask = false;
if (loadY+dy - _Pad.w >= X.height) mask = false;
for (uint dx = 0; dx < 3; ++dx)
{
if (loadX+dx < _Pad.x) mask = false;
if (loadX+dx - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/LOAD_DEPTH; ++m)
{
for (uint q = 0; q < LOAD_DEPTH; ++q)
{
if (mask)
X_[q][id] = X.Get(loadN, loadY+dy-_Pad.y, loadX+dx-_Pad.x, m*LOAD_DEPTH + q);
else
X_[q][id] = 0;
K_[q][id] = K.Get(dy, dx, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id);
}
GroupMemoryBarrierWithGroupSync();
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
[unroll] for (uint i = 0; i < LOAD_DEPTH; ++i)
{
v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * K_[i][bbx*BLOCK_WIDTH + xxx];
}
GroupMemoryBarrierWithGroupSync();
}
}
}
[unroll] for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
{
uint saveNYX = by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy;
uint saveX = saveNYX % width;
uint saveNY = saveNYX / width;
uint saveY = saveNY % height;
uint saveN = saveNY / height;
uint saveK = bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx;
O.Set(saveN, saveY, saveX, saveK, v[yyy][xxx]);
}
#undef X_
#undef K_
}

483
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute


#pragma kernel ScaleBias
#pragma kernel ScaleBias_CNyx2
#pragma kernel ScaleBias_Flat
#pragma kernel Upsample2D
#pragma kernel AvgPool2D
#pragma kernel MaxPool2D

#pragma kernel InstanceNorm
#pragma kernel Copy
/*
ScaleBias_Flat+ScaleBias_CNyx2 (NEW) vs ScaleBias+ScaleBias_CNyx
Compute Precompiled
MOBILENET@4
<<<Exec #64: 66.5 ms, cpu: 7.7 ms, avg: 66.3 ms, result:OK <--- NEW!
<<<Exec #64: 66.7 ms, cpu: 8.0 ms, avg: 67.1 ms, result:OK
*/
#include "Tensor.cginc"
TENSOR_DECL(X)

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void ScaleBias(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
float bias = B.Get(0, 0, 0, c);
float scale = W.Get(0, 0, 0, c);
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
float bias = B.Get(0, 0, 0, c);
float scale = W.Get(0, 0, 0, c);
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}

NUMTHREADS((256,1,1), (128,1,1), (64,1,1))
void ScaleBias_Flat(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.length, 1, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint i = dispatchThreadID.x;
if (i > O.GetLength()) return;
uint c = i % X.channels;
float bias = B.Get(c);
float scale = W.Get(c);
float v = X.Get(i);
v = v * scale + bias;
O.Set(i, v);
}

NUMTHREADS((32,4,1), (32,2,1), (16,2,1))
void ScaleBias_CNyx2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint i = dispatchThreadID.y * X.channels + c;
if (c >= X.channels) return;
if (i >= X.GetLength()) return;
float bias = B.Get(c);
float scale = W.Get(c);
float v = X.Get(i);
v = v * scale + bias;
O.Set(i, v);
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(X.channels, X.width, X.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= X.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float v = X.Get(n, y, x, c);
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Pool.y + dy;
uint ox = x * _Pool.x + dx;
O.Set(n, oy, ox, c, v);
}
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float maxV = -FLT_MAX;
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.w < X.height) && (ox - _Pad.z < X.width);
float v = (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0;
maxV = max(v, maxV);
}
O.Set(n, y, x, c, maxV);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float acc = 0;
float counter = 0;
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.w < X.height) && (ox - _Pad.z < X.width);
acc += (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0;
counter += (mask)? 1: 0;
}
acc /= counter;
O.Set(n, y, x, c, acc);
}
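Note that this average-pooling variant divides by counter, the number of taps that actually landed inside X, so outputs near the border are averaged only over valid input pixels rather than over the full _Pool window (i.e. padding is excluded from the mean).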
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float maxV = -FLT_MAX;
for (uint dy = 0; dy < _Pool[1]; ++dy)
for (uint dx = 0; dx < _Pool[0]; ++dx)
{
float v = X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c);
maxV = max(v, maxV);
}
O.Set(n, y, x, c, maxV);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
float invPoolSize = 1.0f / (_Pool[0] * _Pool[1]);
for (uint n = 0; n < X.batch; ++n)
{
float v = 0;
for (uint dy = 0; dy < _Pool[1]; ++dy)
for (uint dx = 0; dx < _Pool[0]; ++dx)
v += X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c) * invPoolSize;
O.Set(n, y, x, c, v);
}
}
NUMTHREADS((4,8,8), (4,8,4), (4,4,4))

DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v0 = X.Get(n, y*2, x*2, c);
float v1 = X.Get(n, y*2+1, x*2, c);
float v2 = X.Get(n, y*2, x*2+1, c);
float v3 = X.Get(n, y*2+1, x*2+1, c);
float v = max(v0, max(v1, max(v2, v3)));
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, 1, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
if (c >= O.channels) return;
//ASSERT(X.batch == O.batch)
for (uint n = 0; n < X.batch; ++n)
{
float v = 0;
for (uint y = 0; y < X.height; ++y)
for (uint x = 0; x < X.width; ++x)
v += X.Get(n, y, x, c);
v /= (X.height * X.width);
O.Set(n, 0, 0, c, v);
}
DISPATCH_ARGS(O.channels, 1, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
if (c >= O.channels) return;
//ASSERT(X.shape == O.shape)
float gamma = W.Get(0, 0, 0, c);
float beta = B.Get(0, 0, 0, c);
for (uint n = 0; n < O.batch; ++n)
{
uint x, y;
// calc mean
float acc = 0;
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
acc += X.Get(n, y, x, c);
float mean = acc / (O.width * O.height);
// calc variance
acc = 0;
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
{
float delta = X.Get(n, y, x, c) - mean;
acc += delta * delta;
}
float var = acc / (O.width * O.height);
// normalization factor
float invNormFactor = 1 / sqrt(var + FLT_EPSILON);
float scale = gamma * invNormFactor;
float bias = beta - gamma * mean * invNormFactor;
// apply normalization
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
{
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}
}
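The scale/bias pair above is just the instance-normalization identity folded into one multiply-add per element: gamma * (x - mean) / sqrt(var + FLT_EPSILON) + beta == scale * x + bias, with scale = gamma / sqrt(var + FLT_EPSILON) and bias = beta - mean * scale, which is why the final per-pixel loop only needs v * scale + bias.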
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(X.channels, X.width, X.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= X.channels) return; if (x >= X.width) return; if (y >= X.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
O.Set(n + _Pad[0], y + _Pad[1], x + _Pad[2], c + _Pad[3], v);
}
}

44
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc


// Copyright: Copyleft 2012 :-)
float RandomUsingCos(float4 seed)
{
float4 K1 = float4( // Transcendental numbers:
0.64341054629, // (Cahen's constant)
23.14069263277926, // e^pi (Gelfond's constant)
2.665144142690225, // 2^sqrt(2) (Gelfond-Schneider constant)
3.14159265359 // pi
);
return frac(cos(dot(seed, K1)) * 12345.6789);
}
// Based on: https://stackoverflow.com/questions/4200224/random-noise-functions-for-glsl

// A single iteration of Bob Jenkins' One-At-A-Time hashing algorithm.
uint hash(uint x)
{
x += ( x << 10u );
x ^= ( x >> 6u );
x += ( x << 3u );
x ^= ( x >> 11u );
x += ( x << 15u );
return x;
}
uint hash( uint2 v ) { return hash( v.x ^ hash(v.y) ); }
uint hash( uint3 v ) { return hash( v.x ^ hash(v.y) ^ hash(v.z) ); }

// All zeroes yields 0.0, all ones yields the next smallest representable value below 1.0.
float floatConstruct(uint m)
{
const uint ieeeMantissa = 0x007FFFFFu; // binary32 mantissa bitmask
const uint ieeeOne = 0x3F800000u; // 1.0 in IEEE binary32
m &= ieeeMantissa; // Keep only mantissa bits (fractional part)
m |= ieeeOne; // Add fractional part to 1.0
float f = asfloat(m); // Range [1:2]
return f - 1.0; // Range [0:1]
return floatConstruct(hash(asuint(seed)));
}
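To see why floatConstruct lands in [0, 1): masking to the 23 mantissa bits and OR-ing in 0x3F800000 always produces a float in [1.0, 2.0), since m = 0 gives exactly 1.0 and m = 0x007FFFFF gives the largest representable value just below 2.0. Subtracting 1.0 therefore maps the hashed bits onto [0.0, 1.0) with full mantissa resolution, matching the comment above the function.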

float Random(float4 seed)
{
return RandomUsingCos(seed);
return Random(seed) <= p ? 1: 0;
}

480
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc


struct Tensor
{
// @TODO: actually uint seems not like a good idea anymore, consider going to int
uint batch, height, width, channels;
void Init(uint4 nhwc)
{
batch = nhwc.x;
height = nhwc.y;
width = nhwc.z;
channels = nhwc.w;
}
uint4 Dims()
{
return uint4(batch, height, width, channels);
}
uint GetFlatHeight()
{
return batch;
}
uint GetFlatWidth()
{
return height * width * channels;
}
uint GetKernelHeight()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelHeight = batch;
return kernelHeight;
}
uint GetKernelWidth()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelWidth = height;
return kernelWidth;
}
uint GetKernelDepth()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelDepth = width;
return kernelDepth;
}
uint GetKernelCount()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelCount = channels;
return kernelCount;
}
uint GetLength()
{
return batch * height * width * channels;
}
uint Index(uint b, uint h, uint w, uint ch)
{
uint index =
b * height * width * channels +
h * width * channels +
w * channels +
ch;
return index;
}
uint Index(uint b, uint i)
{
uint index =
b * height * width * channels +
i;
return index;
}
StructuredBuffer<float> data;
void Init(uint4 nhwc, StructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch)];
}
float Get(uint b, uint2 pos, uint ch)
{
return data[Index(b, pos.y, pos.x, ch)];
}
float Get(uint b, uint i)
{
return data[Index(b,i)];
}
float Get(uint i)
{
return data[i];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return data[Index(b, pos.y, pos.x, ch)];
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
RWStructuredBuffer<float> data;
void Init(int4 nhwc, RWStructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch)];
}
float Get(uint b, uint2 pos, uint ch)
{
return data[Index(b, pos.y, pos.x, ch)];
}
float Get(uint b, uint i)
{
return data[Index(b,i)];
}
float Get(uint i)
{
return data[i];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return Get(b, pos.y, pos.x, ch);
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
void Set(uint b, uint h, uint w, uint ch, float v)
{
data[Index(b,h,w,ch)] = v;
}
void Set(uint y, uint x, float v)
{
data[Index(y,x)] = v;
}
void Set(uint i, float v)
{
data[i] = v;
}
StructuredBuffer<float> data;
uint offset;
void Init(uint4 nhwc, uint4 info, StructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
offset = info.x;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch) + offset];
}
float Get(uint b, uint2 pos, uint ch)
{
return Get(b, pos.y, pos.x, ch);
}
float Get(uint b, uint i)
{
return data[Index(b,i) + offset];
}
float Get(uint i)
{
return data[i + offset];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return Get(b, pos, ch);
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
};
#define TENSOR_DECL(X) uint4 X##decl[2]; StructuredBuffer<float> X##data;

float fastfma(float a, float b, float c)
{
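// a*b + c written as a single dot product; presumably a nudge for the shader compiler to emit one mad/fma.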
return dot(float2(a,c), float2(b, 1));
}
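For reference, the flat offset computed by Tensor::Index above is plain batch-major NHWC arithmetic. A small C# sketch of the same formula (illustrative only, not Barracuda code):

static class NhwcSketch
{
    // Flat NHWC offset, matching Tensor::Index(b, h, w, ch) above:
    // b*H*W*C + h*W*C + w*C + ch, written in Horner form.
    public static int Index(int b, int h, int w, int ch, int height, int width, int channels)
    {
        return ((b * height + h) * width + w) * channels + ch;
    }
    // Example: element (b=1, h=2, w=3, ch=0) of a 2x4x4x8 tensor sits at offset
    // ((1*4 + 2)*4 + 3)*8 + 0 = 216; the Index(b, i) overload is simply b*H*W*C + i.
}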

112
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute


struct TextureAsTensor : Tensor
{
Texture2D<float4> tex;
SamplerState smp;
Texture2DArray<float4> texArray;
SamplerState smpArray;
void Init(uint4 nhwc, Texture2D<float4> tex_, SamplerState sampler_, Texture2DArray<float4> texArray_, SamplerState samplerArray_)
{
Tensor::Init(nhwc);
tex = tex_;
smp = sampler_;
texArray = texArray_;
smpArray = samplerArray_;
}
float4 Get(uint b, uint y, uint x)
{
float3 loc = float3((float)x / (float)width, (float)y / (float)height, b);
if (batch > 1)
return texArray.SampleLevel(smpArray, loc, 0);
else
return tex.SampleLevel(smp, loc.xy, 0);
}
};
#define TENSOR_SHARED2_ARGS3(A, B, S, O) TENSOR_SHARED_ARG(A, S); TENSOR_SHARED_ARG(B, S); TENSOR_ARG_RW(O);

{
// @TODO: currently it fails to compile, needs to be investigated
#if 0
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TextureAsTensor X; X.Init(Xdecl[0], Xtex2D, samplerXtex2D, Xtex2DArray, samplerXtex2DArray);
TENSOR_SHARED_ARG(K, WBK);
TENSOR_SHARED_ARG(B, WBK);
TENSOR_ARG_RW(O);
// ASSERT(X.channels <= MAX_CHANNELS)
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) continue;
if (oy - _Pad.w >= X.height) continue;
if (ox < _Pad.x) continue;
if (ox - _Pad.z >= X.width) continue;
float4 in4channels = X.Get(n, oy - _Pad.y, ox - _Pad.x);
for (uint c = 0; c < X.channels && c < MAX_CHANNELS; ++c)
{
acc += in4channels[c] * K.Get(dy, dx, c, k);
}
}
}
O.Set(n, y, x, k, acc);
}
#endif
}

23
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md


# Release notes
## 0.2.0
- Version bumped to 0.2.0 as this release brings breaking API changes; see the details below.
- Significantly reduced temporary memory allocations by introducing internal allocator support. Memory is now re-used between layer executions as much as possible.
- Improved small-workload performance on the CSharp backend.
- Added a parallel implementation for multiple activation functions on the CSharp backend.
- Added a `Peek()` function to `IWorker`; it keeps the tensor in the worker's allocator, which is handy for quickly grabbing an output. If you need the output tensor's contents to survive subsequent `Execute()` invocations, use `Fetch()` instead (see the usage sketch after this list).
- Fixed ESRGAN model conversion (ONNX importer).
- Fixed Tensor <-> Texture copy for textures/tensors whose dimensions are not a multiple of 8.
- Added a `Summary()` method to `Worker`. Currently it returns allocator information.
- Tabs to spaces! Aiming at a higher salary (https://stackoverflow.blog/2017/06/15/developers-use-spaces-make-money-use-tabs/).
- Renamed worker type enum members: `CSharp` -> `CSharpRef`, `CSharpFast` -> `CSharp`, `Compute` -> `ComputeRef`, `ComputeFast` -> `Compute`.
- Implemented a new optimized `ComputePrecompiled` worker. This worker caches Compute kernels and state up front to reduce CPU overhead.
- Added `ExecuteAsync()` to the `IWorker` interface. It returns an `IEnumerator`, which lets you control how many layers are scheduled per frame (one iteration == one layer).
- Added `Log` op support on Compute workers.
- Optimized activation functions and ScaleBias by accessing the tensor as a contiguous array. Gained ~2.0 ms on a batch-4 MobileNet (MBP2016).
- Introduced _Loop versions of activations to work around the 65535 dispatch limit on D3D11.
- Added .nn as the Barracuda model file extension for use in the Unity Editor, along with a simple editor importer. You can now declare serializable fields as NNModel to bind them to a .nn asset; ModelLoader.Load() now accepts NNModel as a source.
- Compute: Reduce reference GPU implementation.
- TF importer: expanded Mean support to mean over channels, implemented Pad (as Border2D), implemented SquaredDifference, added InstanceNormalization and LeakyRelu patterns, and implemented StridedSlice.
- TF importer: model nodes are now sorted by dependency before processing.
- Fixed a ComputeBuffer leak when using the Compute and ComputePrecompiled backends.
- Conv2D_L1Cached64_RegisterBlock4x4 is now used more often: improves perf ~2x on Vega 16 and ~30% on Nvidia and Intel.
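As a rough illustration of the 0.2.0 items above (the renamed worker types, `Peek()` vs `Fetch()`, and `ExecuteAsync()`), here is a minimal C# sketch. The argument lists, the `Model` type name, and the output name "output" are assumptions based on these notes and on the LearningBrain.cs changes further down, not a verified API reference.

    using System.Collections;
    using Barracuda;

    public class WorkerUsageSketch
    {
        public IEnumerator Run(Model model, bool verbose)
        {
            // Worker types after the rename: CSharpRef, CSharp, ComputeRef, Compute, ComputePrecompiled.
            var worker = BarracudaWorkerFactory.CreateWorker(
                BarracudaWorkerFactory.Type.ComputePrecompiled, model, verbose);

            worker.Execute();                        // synchronous path (inputs omitted in this sketch)
            var borrowed = worker.Peek("output");    // stays in the worker's allocator, valid until the next Execute()
            var owned = worker.Fetch("output");      // survives later Execute() calls; the caller disposes it
            owned.Dispose();

            // Asynchronous path: one iteration schedules roughly one layer.
            var schedule = worker.ExecuteAsync();
            while (schedule.MoveNext())
                yield return null;                   // e.g. spread layer scheduling across frames in a coroutine
        }
    }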
## 0.1.6
- Added printing of the activation type in verbose mode.
- Added fast, parallel CPU implementations for Swish, Relu, Add, Sub, Div, Min, Max, Tanh, Exp.

2
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json


{
"name": "com.unity.barracuda",
"displayName": "Barracuda",
"version": "0.1.6-preview",
"version": "0.2.0-preview",
"unity": "2017.4",
"description": "Barracuda is lightweight and cross-platform Neural Net inference library. Barracuda supports inference both on GPU and CPU.",
"dependencies": {}

2
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs


var elementType = src.GetType().GetElementType();
var elementSize = Marshal.SizeOf(elementType);
var dest = Array.CreateInstance(elementType, shape);
Buffer.BlockCopy(src, 0, dest, 0, src.Length * elementSize);
Buffer.BlockCopy(src, 0, dest, 0, dest.Length * elementSize);
return dest;
}

7
UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs


_barracudaModel = ModelLoader.Load(model.Value);
var executionDevice = inferenceDevice == InferenceDevice.GPU
? BarracudaWorkerFactory.Type.ComputeFast
: BarracudaWorkerFactory.Type.CSharpFast;
? BarracudaWorkerFactory.Type.ComputePrecompiled
: BarracudaWorkerFactory.Type.CSharp;
_engine = BarracudaWorkerFactory.CreateWorker(executionDevice, _barracudaModel, _verbose);
}

var outputs = new List<Tensor>();
foreach (var name in names)
{
var outp = _engine.Fetch(name);
var outp = _engine.Peek(name);
outp.Dispose();
}
return outputs;

354
ml-agents/mlagents/trainers/barracuda.py


from collections import defaultdict
import numpy as np
import json
import struct # convert from Python values and C structs
import re
import argparse
import os.path

self.globals = []
self.memories = []
def __init__(self, **entries):
self.__dict__.update(entries)
def __init__(self, **entries): self.__dict__.update(entries)
parser.add_argument("source_file", help=help)
parser.add_argument("target_file", help="output Barracuda binary file")
parser.add_argument("-trim", "--trim-unused-by-output")
parser.add_argument("--print-layers", action="store_true")
parser.add_argument("--print-source-json", action="store_true")
parser.add_argument("-json", "--print-barracuda-json", action="store_true")
parser.add_argument("--print-layer-links", action="store_true")
parser.add_argument("--print-patterns", action="store_true")
parser.add_argument("--print-tensors", action="store_true")
parser.add_argument("--verbose", action="store_true")
parser.add_argument('source_file', help=help)
parser.add_argument('target_file', help='output Barracuda binary file')
parser.add_argument('-trim', '--trim-unused-by-output')
parser.add_argument('--print-layers', action='store_true')
parser.add_argument('--print-source-json', action='store_true')
parser.add_argument('-json', '--print-barracuda-json', action='store_true')
parser.add_argument('--print-layer-links', action='store_true')
parser.add_argument('--print-patterns', action='store_true')
parser.add_argument('--print-tensors', action='store_true')
parser.add_argument('--verbose', action='store_true')
args.compress_f16 = (
False
) # TEMP: disabled, until properly implemented parser.add_argument('-f16', '--compress-f16', action='store_true')
output_extension = ".bc" if not args.compress_f16 else ".f16.bc"
args.compress_f16 = False # TEMP: disabled, until properly implemented parser.add_argument('-f16', '--compress-f16', action='store_true')
output_extension = '.bc' if not args.compress_f16 else '.f16.bc'
print("File", args.source_file, "does not exist.")
print('File', args.source_file, 'does not exist.')
return os.path.splitext(os.path.basename(filename))[0] + newExtenstion
return os.path.splitext(os.path.basename(filename))[0] + newExtenstion;
args.target_file = os.path.join(
args.target_file,
replaceFilenameExtension(args.source_file, output_extension),
)
args.target_file = os.path.join(args.target_file, replaceFilenameExtension(args.source_file, output_extension))
if args.verbose:
print(args)

# Fuse training time BatchNorm tensors into Scale & Bias
def fuse_batchnorm_weights(gamma, beta, mean, var, epsilon):
# https://github.com/Tencent/ncnn/blob/master/src/layer/batchnorm.cpp

bias = beta - gamma * mean / np.sqrt(var + epsilon)
return [scale, bias]
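For context, this is the standard BatchNorm folding (see the ncnn reference above): presumably scale = gamma / np.sqrt(var + epsilon), so the bias line shown here equals beta - mean * scale; the scale assignment itself simply falls outside the lines included in this hunk.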
if hasattr(model, "layers"):
if hasattr(model, 'layers'):
model = model.layers
inputs_and_memories = set(list(inputs) + list(memories[1::3]))

ready.add(l.name)
return missing
# Class to represent a graph
class Graph:
def __init__(self, vertices):
self.graph = defaultdict(list) # dictionary containing adjacency List
self.V = vertices # No. of vertices
# function to add an edge to graph
def addEdge(self, u, v):
self.graph[u].append(v)
# A recursive function used by topologicalSort
def topologicalSortUtil(self, v, visited, stack):
# Mark the current node as visited.
class Graph:
def __init__(self,vertices):
self.graph = defaultdict(list) #dictionary containing adjacency List
self.V = vertices #No. of vertices
# function to add an edge to graph
def addEdge(self,u,v):
self.graph[u].append(v)
# A recursive function used by topologicalSort
def topologicalSortUtil(self,v,visited,stack):
# Mark the current node as visited.
# Recur for all the vertices adjacent to this vertex
for i in self.graph[v]:
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
# Push current vertex to stack which stores result
stack.insert(0,v)
# Recur for all the vertices adjacent to this vertex
for i in self.graph[v]:
if visited[i] == False:
self.topologicalSortUtil(i, visited, stack)
# Push current vertex to stack which stores result
stack.insert(0, v)
# The function to do Topological Sort. It uses recursive
# topologicalSortUtil()
def topologicalSort(self):
# Mark all the vertices as not visited
visited = [False] * self.V
stack = []
# Call the recursive helper function to store Topological
# Sort starting from all vertices one by one
for i in range(self.V):
if visited[i] == False:
self.topologicalSortUtil(i, visited, stack)
# print(stack)
# The function to do Topological Sort. It uses recursive
# topologicalSortUtil()
def topologicalSort(self):
# Mark all the vertices as not visited
visited = [False]*self.V
stack =[]
# Call the recursive helper function to store Topological
# Sort starting from all vertices one by one
for i in range(self.V):
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
#print(stack)
if len(find_missing_inputs(model, inputs_and_memories)) == 0:
if (len(find_missing_inputs(model, inputs_and_memories)) == 0):
return model
g = Graph(len(model))

for l in model:
layers[l.name] = id
layers[l.name] = id;
id += 1
for layer in model:

print("SORTED:", sorted_layer_indices)
new_model = [model[idx] for idx in sorted_layer_indices]
assert len(find_missing_inputs(new_model, inputs_and_memories)) == 0
assert(len(find_missing_inputs(new_model, inputs_and_memories)) == 0)
if hasattr(model, "layers"):
if hasattr(model, 'layers'):
def flatten(items, enter=lambda x: isinstance(x, list)):
def flatten(items,enter=lambda x:isinstance(x, list)):
# http://stackoverflow.com/a/40857703
# https://github.com/ctmakro/canton/blob/master/canton/misc.py
"""Yield items from any nested iterable; see REF."""

yield x
def trim_model(model, outputs):
layers = {l.name: l for l in model}
layers = {l.name:l for l in model}
connected = {o for o in outputs}
while len(outputs) > 0:
outputs = set(flatten([layers[o].inputs for o in outputs if o in layers]))

connected.add(o)
trimmed = [l.name for l in model if l.name not in connected]
return str(arr)[1:-1] # array to string without brackets
print("TRIMMED:", array_without_brackets(trimmed))
return [l for l in model if l.name in connected]

print("Trimming model given outputs to preserve:", preserve_outputs)
model = trim_model(model, preserve_outputs)
else:
print(
"WARNING: Trim couldn't find any layers to match:", criteria_regexp_string
)
print("WARNING: Trim couldn't find any layers to match:", criteria_regexp_string)
compress_classes = {"Dense"}
compress_classes = {
'Dense'
}
if l.class_name in compress_classes:
print(
"Compressing %s layer '%s' weights to float16" % (l.class_name, l.name)
)
if (l.class_name in compress_classes):
print("Compressing %s layer '%s' weights to float16" % (l.class_name, l.name))
if isinstance(o, np.ndarray): # skip binary data packed inside ndarray
if getattr(o, "__dict__", None):
if getattr(o, '__dict__', None):
s = json.dumps(model.layers, cls=StructEncoder, separators=(", ", ":"))
s = json.dumps(model.layers, cls=StructEncoder, separators=(', ',':'))
s = s.replace("]}, {", "]},\n{")
s = s.replace(":[{", ":[\n\t{")
s = s.replace("}, {", "},\n\t{")
s = s.replace(']}, {', ']},\n{')
s = s.replace(':[{', ':[\n\t{')
s = s.replace('}, {', '},\n\t{')
return str(arr)[1:-1] # array to string without brackets
if print_layer_links:
for l in model.layers:

if model.globals:
if isinstance(model.globals, dict):
model.globals = {x.name: x.shape for x in model.globals}
model.globals = {x.name:x.shape for x in model.globals}
ins = {i: model.inputs[i] for i in l.inputs if i in model.inputs}
ins = {i:model.inputs[i] for i in l.inputs if i in model.inputs}
else:
ins = [i for i in l.inputs if i in model.inputs]
if ins:

print("OUT:", array_without_brackets(model.outputs))
if print_tensors:
if (print_tensors):
def __init__(self, scope=""):
def __init__(self, scope=''):
if attr == "_":
if attr == '_':
return self.layers[-1].name if len(self.layer) > 0 else self.scope
raise AttributeError(attr)

i = 1
while name in self.names_taken:
name = self.layers[-1].op + "_" + str(i)
name = self.layers[-1].op + '_' + str(i)
self.layers[-1].name = self.scope + ("/" if self.scope else "") + name
self.layers[-1].name = self.scope + ('/' if self.scope else '') + name
def concat(self, a, b, out=""):
self.layers += [Struct(name=out, op="Concat", input=[a, b])]
def concat(self, a, b, axis=-1, out=''):
self.layers += [Struct(name=out, op='Concat', axis=axis, input=[a, b])]
return self._patch_last_layer_name_and_return()
def mad(self, x, kernel, bias, out=''):
self.layers += [Struct(name=out, op='Dense', input=[x, kernel, bias])]
return self._patch_last_layer_name_and_return()
def mul(self, a, b, out=''):
self.layers += [Struct(name=out, op='Mul', input=[a, b])]
return self._patch_last_layer_name_and_return()
def add(self, a, b, out=''):
self.layers += [Struct(name=out, op='Add', input=[a, b])]
def mad(self, x, kernel, bias, out=""):
self.layers += [Struct(name=out, op="Dense", input=[x, kernel, bias])]
def sub(self, a, b, out=''):
self.layers += [Struct(name=out, op='Sub', input=[a, b])]
def mul(self, a, b, out=""):
self.layers += [Struct(name=out, op="Mul", input=[a, b])]
def sigmoid(self, x, out=''):
self.layers += [Struct(name=out, op='Sigmoid', input=[x])]
def add(self, a, b, out=""):
self.layers += [Struct(name=out, op="Add", input=[a, b])]
def tanh(self, x, out=''):
self.layers += [Struct(name=out, op='Tanh', input=[x])]
def sub(self, a, b, out=""):
self.layers += [Struct(name=out, op="Sub", input=[a, b])]
def reduce(self, op, x, axis=-1, out=''):
self.layers += [Struct(name=out, op='Reduce'+op, axis=axis, input=[x])]
def sigmoid(self, x, out=""):
self.layers += [Struct(name=out, op="Sigmoid", input=[x])]
def pool(self, op, x, out=''):
self.layers += [Struct(name=out, op=op+'Pool', input=[x])]
def tanh(self, x, out=""):
self.layers += [Struct(name=out, op="Tanh", input=[x])]
def strided_slice(self, x, begin, end, strides, rank, out=''):
self.layers += [Struct(name=out, op='StridedSlice', rank=rank, starts=begin, ends=end, slice_strides=strides, input=[x])]
def mean(name, input, axis=-1):
''' combines mean operation out of several simpler ops
'''
nn = Build(name)
if np.array_equal(axis, [1,2]):
nn.pool('GlobalAvg', input, out=name)
elif np.array_equal(axis, [1,2,3]):
nn.reduce('Mean', # over channels
nn.pool('GlobalAvg', input), # over height & width
out=name)
elif np.array_equal(axis, [3]) or np.array_equal(axis, [-1]) or np.array_equal(axis, 3) or np.array_equal(axis, -1):
nn.reduce('Mean', input, out=name)
return nn.layers
def rnn(name, input, state, kernel, bias, new_state, number_of_gates=2):
""" - Ht = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
"""
def rnn(name, input, state, kernel, bias, new_state, number_of_gates = 2):
''' - Ht = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
'''
nn.tanh(nn.mad(kernel=kernel, bias=bias, x=nn.concat(input, state)), out=new_state)
nn.tanh(
nn.mad(kernel=kernel, bias=bias,
x=nn.concat(input, state)),
out=new_state);
def gru(
name,
input,
state,
kernel_r,
kernel_u,
kernel_c,
bias_r,
bias_u,
bias_c,
new_state,
number_of_gates=2,
):
""" - zt = f(Xt*Wz + Ht_1*Rz + Wbz + Rbz)
def gru(name, input, state, kernel_r, kernel_u, kernel_c, bias_r, bias_u, bias_c, new_state, number_of_gates = 2):
''' - zt = f(Xt*Wz + Ht_1*Rz + Wbz + Rbz)
"""
'''
nn = Build(name)
inputs = nn.concat(input, state)

c = nn.tanh(nn.mad(kernel=kernel_c, bias=bias_c, x=nn.concat(input, r_state)))
c = nn.tanh(nn.mad(kernel=kernel_c, bias=bias_c,
x=nn.concat(input, r_state)))
# new_h = u' * state + (1 - u') * c'
# = u' * state + c' - u' * c'

# - u' * c'
nn.sub(nn._, nn.mul(u, c), out=new_state)
return nn.layers
nn.sub(nn._, nn.mul(u, c),
out=new_state)
return nn.layers;
def lstm(
name,
input,
state_c,
state_h,
kernel_i,
kernel_j,
kernel_f,
kernel_o,
bias_i,
bias_j,
bias_f,
bias_o,
new_state_c,
new_state_h,
):
""" Full:
def lstm(name, input, state_c, state_h, kernel_i, kernel_j, kernel_f, kernel_o, bias_i, bias_j, bias_f, bias_o, new_state_c, new_state_h):
''' Full:
- it = f(Xt*Wi + Ht_1*Ri + Pi . Ct_1 + Wbi + Rbi)
- ft = f(Xt*Wf + Ht_1*Rf + Pf . Ct_1 + Wbf + Rbf)
- ct = g(Xt*Wc + Ht_1*Rc + Wbc + Rbc)

"""
'''
""" No peephole:
''' No peephole:
- it = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
- ft = f(Xt*Wf + Ht_1*Rf + Wbf + Rbf)
- ct = g(Xt*Wc + Ht_1*Rc + Wbc + Rbc)

"""
'''
j = nn.tanh(nn.mad(inputs, kernel_j, bias_j))
nn.add(nn.mul(state_c, f), nn.mul(i, j), out=new_state_c)
nn.add(
nn.mul(state_c, f), nn.mul(i, j),
out=new_state_c)
# new_h =
nn.mul(o, nn.tanh(new_state_c), out=new_state_h)
# new_h =
nn.mul(o, nn.tanh(new_state_c),
out=new_state_h)
# Serialize
class BarracudaWriter:

self.f = open(filename, "wb+")
self.f = open(filename, 'wb+')
def __enter__(self):
return self

def write_str(self, s):
self.write_int32(len(s))
self.f.write(s.encode("ascii"))
self.f.write(s.encode('ascii'))
self.f.write(struct.pack("<f", d))
self.f.write(struct.pack('<f', d))
self.f.write(struct.pack("<i", d))
self.f.write(struct.pack('<i', d))
self.f.write(struct.pack("<q", d))
self.f.write(struct.pack('<q', d))
def write_shape(self, s):
self.write_int32(len(s))

def close(self):
self.f.close()
# VERSION = 0xBA22AC0DA000 + BARRACUDA_VERSION
#VERSION = 0xBA22AC0DA000 + BARRACUDA_VERSION
w.write_int64(BARRACUDA_VERSION)
# inputs

w.write_str_array(model.outputs)
# memories
w.write_int32(len(model.memories) // 3)
for mem_shape, mem_in, mem_out in zip(
model.memories[0::3], model.memories[1::3], model.memories[2::3]
):
w.write_int32(len(model.memories)//3)
for mem_shape, mem_in, mem_out in zip(model.memories[0::3], model.memories[1::3], model.memories[2::3]):
w.write_shape(mem_shape)
w.write_str(mem_in)
w.write_str(mem_out)

w.write_int32(len(model.layers))
for l in model.layers:
assert not l.name in l.inputs
assert(not l.name in l.inputs)
w.write_int32(0) # dummy
w.write_int32(0) # dummy
w.write_int32(0) #dummy
w.write_int32(0) #dummy
w.write_shape(l.pads)
w.write_shape(l.strides)
w.write_shape(l.pool_size)

w.write_int32(0) # dummy
w.write_int32(0) #dummy
assert len(x.shape) == 4
assert x.data.nbytes % 4 == 0
length = (
x.data.nbytes >> 2
) # length is measured in float32s (at least for now)
assert(len(x.shape) == 4)
assert(x.data.nbytes % 4 == 0)
length = x.data.nbytes >> 2 # length is measured in float32s (at least for now)
w.write_str(x.name)
w.write_shape(x.shape)

for x in all_tensors:
w.write_array(x.data)

914
ml-agents/mlagents/trainers/tensorflow_to_barracuda.py
File diff too large to display.

2
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta


fileFormatVersion: 2
guid: 83221ad3db87f4b3b91b041047cb2bc5
guid: 19ed1486aa27d4903b34839f37b8f69f
MonoImporter:
externalObjects: {}
serializedVersion: 2

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta


fileFormatVersion: 2
guid: 4b10c58689ee84c2abe895327686f532
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta


fileFormatVersion: 2
guid: e192a80b369ad4683a329432eeb5ec20
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef


{
"name": "Barracuda-editor",
"references": [],
"includePlatforms": [
"Editor"
],
"excludePlatforms": []
}

7
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta


fileFormatVersion: 2
guid: 9f1e7d835703842dda0e25142ed6c3c9
AssemblyDefinitionImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png

Width: 64  |  Height: 64  |  Size: 2.3 KiB

106
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta


fileFormatVersion: 2
guid: 57d823f2746e44dc79116df94518bd27
TextureImporter:
fileIDToRecycleName: {}
externalObjects: {}
serializedVersion: 4
mipmaps:
mipMapMode: 0
enableMipMap: 0
sRGBTexture: 0
linearTexture: 0
fadeOut: 0
borderMipMap: 0
mipMapsPreserveCoverage: 0
alphaTestReferenceValue: 0.5
mipMapFadeDistanceStart: 1
mipMapFadeDistanceEnd: 3
bumpmap:
convertToNormalMap: 0
externalNormalMap: 0
heightScale: 0.25
normalMapFilter: 0
isReadable: 0
grayScaleToAlpha: 0
generateCubemap: 6
cubemapConvolution: 0
seamlessCubemap: 0
textureFormat: 1
maxTextureSize: 2048
textureSettings:
serializedVersion: 2
filterMode: -1
aniso: 1
mipBias: -1
wrapU: 1
wrapV: 1
wrapW: -1
nPOTScale: 0
lightmap: 0
compressionQuality: 50
spriteMode: 0
spriteExtrude: 1
spriteMeshType: 1
alignment: 0
spritePivot: {x: 0.5, y: 0.5}
spritePixelsToUnits: 100
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
spriteGenerateFallbackPhysicsShape: 1
alphaUsage: 1
alphaIsTransparency: 1
spriteTessellationDetail: -1
textureType: 2
textureShape: 1
maxTextureSizeSet: 0
compressionQualitySet: 0
textureFormatSet: 0
platformSettings:
- buildTarget: DefaultTexturePlatform
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Standalone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: iPhone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Android
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
spriteSheet:
serializedVersion: 2
sprites: []
outline: []
physicsShape: []
spritePackingTag:
userData:
assetBundleName:
assetBundleVariant:

42
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs


using System.IO;
using UnityEditor;
using UnityEngine;
using UnityEditor.Experimental.AssetImporters;
namespace Barracuda
{
/// <summary>
/// Asset Importer of barracuda models.
/// </summary>
[ScriptedImporter(1, new[] {"nn"})]
public class NNModelImporter : ScriptedImporter {
private const string iconName = "NNModelIcon";
private Texture2D iconTexture;
public override void OnImportAsset(AssetImportContext ctx)
{
var model = File.ReadAllBytes(ctx.assetPath);
var asset = ScriptableObject.CreateInstance<NNModel>();
asset.Value = model;
ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
ctx.SetMainObject(asset);
}
private Texture2D LoadIconTexture()
{
if (iconTexture == null)
{
string[] allCandidates = AssetDatabase.FindAssets(iconName);
if (allCandidates.Length > 0)
{
iconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
}
}
return iconTexture;
}
}
}
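To show how the imported asset is meant to be consumed (per the 0.2.0 release notes above), a minimal sketch follows; the MonoBehaviour and field names are ours, and the only API facts assumed are that NNModel is a ScriptableObject asset and that ModelLoader.Load() accepts an NNModel.

using UnityEngine;
using Barracuda;

// Illustrative only: binds an imported .nn asset to a component field.
public class NNModelHolder : MonoBehaviour
{
    public NNModel modelAsset;   // assign the imported .nn asset in the Inspector

    void Start()
    {
        // Release notes: ModelLoader.Load() now accepts an NNModel as a source.
        var model = ModelLoader.Load(modelAsset);
        // ...create a worker for 'model', as LearningBrain.cs does above.
    }
}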

29
UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs


using System.IO;
using UnityEditor;
using UnityEngine;
using UnityEditor.Experimental.AssetImporters;
using MLAgents.InferenceBrain;
namespace MLAgents
{
/// <summary>
/// Asset Importer of barracuda models.
/// </summary>
[ScriptedImporter(1, new[] {"nn"})]
public class NNModelImporter : ScriptedImporter {
private const string IconPath = "Assets/ML-Agents/Resources/NNModelIcon.png";
public override void OnImportAsset(AssetImportContext ctx)
{
var model = File.ReadAllBytes(ctx.assetPath);
var asset = ScriptableObject.CreateInstance<NNModel>();
asset.Value = model;
Texture2D texture = (Texture2D)
AssetDatabase.LoadAssetAtPath(IconPath, typeof(Texture2D));
ctx.AddObjectToAsset(ctx.assetPath, asset, texture);
ctx.SetMainObject(asset);
}
}
}

10
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs


using UnityEngine;
namespace MLAgents.InferenceBrain
{
public class NNModel : ScriptableObject
{
[HideInInspector]
public byte[] Value;
}
}

11
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta


fileFormatVersion: 2
guid: fb1293e6d636b46d09ae35b36241a0c6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

/UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs.meta → /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta
