
First stage of ML Agents update to Barracuda 0.2.x

/develop-generalizationTraining-TrainerController
Mantas Puida 6 years ago
Current commit
27567062
55 files changed, with 4,646 additions and 3,834 deletions
  1. UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta (2)
  2. UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta (2)
  3. UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta (2)
  4. UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta (2)
  5. UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta (2)
  6. UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta (2)
  7. UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta (2)
  8. UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta (2)
  9. UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta (2)
  10. UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta (2)
  11. UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta (2)
  12. UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta (2)
  13. UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat (5)
  14. UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat (5)
  15. UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta (2)
  16. UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta (2)
  17. UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta (2)
  18. UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta (2)
  19. UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta (2)
  20. UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta (2)
  21. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md (23)
  22. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll (997)
  23. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute (918)
  24. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute (944)
  25. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute (68)
  26. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute (566)
  27. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute (632)
  28. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute (438)
  29. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute (30)
  30. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute (944)
  31. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute (214)
  32. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute (483)
  33. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc (44)
  34. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc (480)
  35. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute (112)
  36. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md (23)
  37. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json (2)
  38. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs (2)
  39. UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs (7)
  40. ml-agents/mlagents/trainers/barracuda.py (354)
  41. ml-agents/mlagents/trainers/tensorflow_to_barracuda.py (914)
  42. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta (2)
  43. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta (8)
  44. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta (8)
  45. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef (8)
  46. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta (7)
  47. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png (8)
  48. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta (106)
  49. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs (42)
  50. UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs (29)
  51. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs (10)
  52. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta (11)
  53. /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta (0)

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.10980392, g: 0.6039216, b: 1, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.5, g: 0.5, b: 0.5, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

23
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md


### Load Model into Barracuda
Once you have converted your TensorFlow (or ONNX) model, you can load the resulting Barracuda file via `ModelLoader`:
```C#
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".bytes");
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".nn");
```
Another option is to use the editor model importer. Just add a public `NNModel` field to your C# script and assign the ``.nn`` model file via the editor UI:
```C#
public NNModel modelSource;
<..>
var model = ModelLoader.Load(modelSource);
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputeFast, model)
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputePrecompiled, model)
```
### Execute the model

Execution is asynchronous for GPU backends. The current implementation is synchronous for CPU backends; however, it is good to assume that execution will be asynchronous for all backends in the future.
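A minimal sketch of a single inference step under these semantics; the `Execute` overloads, the tensor shape, and the input name below are assumptions based on how the ML-Agents inference code drives the worker, not shown in this excerpt:
```C#
// Hypothetical input tensor created and owned by the caller; shape is a placeholder.
var input = new Tensor(1, observationSize);
worker.Execute(input);   // asynchronous on GPU backends, synchronous on CPU today
// Multi-input models take a named dictionary instead, e.g.:
// worker.Execute(new Dictionary<string, Tensor> { { "vector_observation", input } });
```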
### Fetch outputs
If the model has only a single output, then a simple `worker.Fetch()` can be used; otherwise output names should be provided.
If the model has only a single output, then a simple `worker.Peek()` can be used; otherwise output names should be provided.
var O = worker.Fetch(outputName);
var O = worker.Peek(outputName);
_Note:_ ``Peek()`` does not take ownership of the tensor. If you expect to keep the tensor for a longer time, use ``Fetch()``.
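A short sketch of that ownership difference (names are illustrative only):
```C#
var peeked = worker.Peek(outputName);   // worker still owns this tensor; do not Dispose it
// ... read values while the worker holds the result ...

var fetched = worker.Fetch(outputName); // ownership transfers to the caller
// ... safe to keep this tensor around for longer ...
fetched.Dispose();                      // caller must dispose fetched tensors
```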
### Cleanup
As a Barracuda client you are responsible for calling `Dispose` on the _worker_, the _inputs_, and any _outputs_ you fetched. This is necessary to properly free GPU resources.
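For example, a minimal teardown might look like this (assuming `input` was created by the caller and `output` was obtained with `Fetch()`):
```C#
output.Dispose();   // tensors fetched with Fetch() are owned by the caller
input.Dispose();    // inputs created by the caller must also be disposed
worker.Dispose();   // releases compute buffers and other GPU resources
```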

### Texture as output
If you want to use Barracuda execution results further in the graphics pipeline, you can copy data from `Tensor` into `RenderTexture` without stalling CPU or GPU:
```C#
var tensor = worker.Fetch();
var tensor = worker.Peek();
var texture = BarracudaTextureUtils.TensorToRenderTexture(tensor);
```
If you wish, you can reuse the same `RenderTexture` multiple times:
```C#
var tensor = worker.Fetch();
var tensor = worker.Peek();
BarracudaTextureUtils.TensorToRenderTexture(tensor, texture);
```
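As a usage note, the reused `texture` above has to be allocated once by the caller; a plausible setup (dimensions are placeholders that should match the tensor) would be:
```C#
// Allocated once, e.g. in Start(); the third argument 0 means no depth buffer.
var texture = new RenderTexture(width, height, 0);
```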

Convert from TensorFlow:
```bash
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.bytes
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.bytes
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.nn
```
If the network has multiple outputs, but you only need particular ones during inference, there is an optional `-trim` flag to remove unused outputs and calculations.
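A hypothetical invocation is sketched below; the exact argument form of `-trim` is not shown in this excerpt, so the placeholder presumably names the output(s) to keep:
```bash
# Trim the graph to what is needed for the named output(s) before conversion.
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn -trim <output-node-name>
```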

997
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
File diff too large to display

918
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute


#pragma kernel Relu_Flat
#pragma kernel Relu_Loop
#pragma kernel Relu6_Flat
#pragma kernel Relu6_Loop
#pragma kernel Tanh_Flat
#pragma kernel Tanh_Loop
#pragma kernel Swish_Flat
#pragma kernel Swish_Loop
#pragma kernel Sigmoid_Flat
#pragma kernel Sigmoid_Loop
#pragma kernel Elu_Flat
#pragma kernel Elu_Loop
#pragma kernel LeakyRelu_Flat
#pragma kernel LeakyRelu_Loop
#pragma kernel Exp_Flat
#pragma kernel Exp_Loop
#pragma kernel Log_Flat
#pragma kernel Log_Loop
#pragma kernel Pow_Flat
#pragma kernel Pow_Loop
/*
Relu_Flat (NEW) vs Relu_Nyxc+Relu_CNyx+Relu
Compute Precompiled
VGG@1
<<<Exec #128: 59.6 ms, cpu: .9 ms, avg: 62.4 ms, result:OK <--- NEW!
<<<Exec #128: 63.6 ms, cpu: .9 ms, avg: 64.0 ms, result:OK
VGG@4
<<<Exec #16: 276.7 ms, cpu: .9 ms, avg: 272.8 ms, result:OK <--- NEW!
<<<Exec #16: 297.5 ms, cpu: .9 ms, avg: 274.4 ms, result:OK
RES@1
<<<Exec #100: 82.2 ms, cpu: 22.2 ms, avg: 81.0 ms, result:OK <--- NEW!
<<<Exec #100: 82.1 ms, cpu: 22.5 ms, avg: 85.4 ms, result:OK
PPO_2@256
<<<Exec #200: 10.3 ms, cpu: 7.6 ms, avg: 11.9 ms, result:OK <--- NEW!
<<<Exec #200: 10.9 ms, cpu: 8.3 ms, avg: 12.3 ms, result:OK
PPO_CNN@256
<<<Exec #100: 60.6 ms, cpu: 62.3 ms, avg: 65.6 ms, result:OK <--- NEW!
<<<Exec #100: 72.6 ms, cpu: 62.7 ms, avg: 66.0 ms, result:OK
*/
#pragma kernel Relu
#pragma kernel Relu_CNyx
#pragma kernel Relu_Nyxc

#pragma kernel Exp
#pragma kernel Exp_CNyx
#pragma kernel Exp_Nyxc
#pragma kernel Log
#pragma kernel Log_CNyx
#pragma kernel Log_Nyxc
#pragma kernel Pow
#pragma kernel Pow_CNyx
#pragma kernel Pow_Nyxc

TENSOR_DECL_RW(O)
float _Alpha;
uint _LoopStride;
#define FLAT_ACTIVATION(name, op_name) \
void name##_Flat (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
if (i > O.GetLength()) return;\
\
float v = X.Get(i);\
v = op_name (v);\
O.Set(i, v);\
}
#define LOOP_ACTIVATION(name, op_name) \
void name##_Loop (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
uint len = O.GetLength();\
\
while (i < len) {\
float v = X.Get(i); \
v = op_name (v); \
O.Set(i, v); \
i += _LoopStride; \
}\
}
#define ACTIVATION(name, op_name) \
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
FLAT_ACTIVATION(name, op_name)\
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
LOOP_ACTIVATION(name, op_name)
return 0.5f * (v + abs(v));
return 0.5f * (v + abs(v));
return min(max(0, v), 6);
return min(max(0, v), 6);
return v / (1.f + exp(-v));
return v / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
return max(v, _Alpha * v);
return max(v, _Alpha * v);
float signed_pow(float f, float e)
float signed_pow(float f)
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
float e = _Alpha;
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
ACTIVATION(Relu, relu)
ACTIVATION(Relu6, relu6)
ACTIVATION(Tanh, tanh)
ACTIVATION(Sigmoid, sigmoid)
ACTIVATION(Swish, swish)
ACTIVATION(Elu, elu)
ACTIVATION(LeakyRelu, lrelu)
ACTIVATION(Exp, exp)
ACTIVATION(Log, log)
ACTIVATION(Pow, signed_pow)
// -------------------
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Log(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Log_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
}
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Log_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}

DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
}

944
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute
File diff too large to display

68
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute


NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastAdd(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastSub(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMul(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastDiv(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

float signed_pow(float f, float e)
{
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMin(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMax(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

566
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute


#pragma kernel Conv2D
#pragma kernel Conv2D_RegisterBlock4x2
//#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel DepthwiseConv2D

NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
void Conv2D(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
for (uint c = 0; c < X.channels; ++c)
acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
}
}
for (uint c = 0; c < X.channels; ++c)
acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
}
}
O.Set(n, y, x, k, acc);
}
O.Set(n, y, x, k, acc);
}
}

void Conv2D_RegisterBlock4x2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x*SIZE_W >= O.width) return;
if (y*SIZE_H >= O.height) return;
if (k >= K.channels) return;
if (x*SIZE_W >= O.width) return;
if (y*SIZE_H >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
acc[q] = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
acc[q] = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);
for (uint c = 0; c < X.channels; ++c)
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
}
}
for (uint c = 0; c < X.channels; ++c)
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
}
}
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
}
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
}
}
#undef SIZE_W
#undef SIZE_H

[numthreads(L1CACHESIZE, 1, 1)]
void Conv2D_L1Cached64_RegisterBlock4x4(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv2D_L1Cached64_Reg_Loop_safe_X
#define X_ Conv2D_L1Cached64_Reg_Loop_safe_X
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= O.width) return;
if (y*SIZE >= O.height) return;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= O.width) return;
if (y*SIZE >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
GroupMemoryBarrierWithGroupSync();
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
GroupMemoryBarrierWithGroupSync();
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE; ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE && (c + dc) < K.GetKernelDepth(); ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
uint remainderW = (O.width - x*SIZE);
uint remainderH = (O.height - y*SIZE);
uint remainderW = (O.width - x*SIZE);
uint remainderH = (O.height - y*SIZE);
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
if (q/SIZE < remainderH && q%SIZE < remainderW)
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
}
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
if (q/SIZE < remainderH && q%SIZE < remainderW)
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
}
#undef X_
#undef X_
}

DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
acc = fastfma(
X.Get(n, pos, k),
K.Get(dy, dx, 0, k),
acc);
}
acc = fastfma(
X.Get(n, pos, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
O.Set(n, y, x, k, acc);
}
}
}

{
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width, X.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width, X.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
if (k >= K.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
uint2 pad = _Pad.xy / _Stride.xy;
uint2 leftCorner = pad;
uint2 rightCorner = uint2(X.width, X.height) + pad;
uint2 pad = _Pad.xy / _Stride.xy;
uint2 leftCorner = pad;
uint2 rightCorner = uint2(X.width, X.height) + pad;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc = B.Get(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc = B.Get(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy;
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
for (uint c = 0; c < X.channels; ++c)
{
acc = fastfma( X.Get(n, pos - leftCorner, c),
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c, k),
acc);
}
}
}
for (uint c = 0; c < X.channels; ++c)
{
acc = fastfma( X.Get(n, pos - leftCorner, c),
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c, k),
acc);
}
}
}
uint oy = y * _Stride.y + sy;
uint ox = x * _Stride.x + sx;
if (oy < O.height && ox < O.width)
O.Set(n, oy, ox, k, acc);
}
}
}
uint oy = y * _Stride.y + sy;
uint ox = x * _Stride.x + sx;
if (oy < O.height && ox < O.width)
O.Set(n, oy, ox, k, acc);
}
}
}
}
#undef L1CACHESIZE

[numthreads(L1CACHESIZE, 1, 1)]
void Conv2DTrans_L1Cached64_RegisterBlock2x2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width / SIZE, X.height / SIZE);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv2DTrans_L1Cached64_Reg_Loop_safe_X
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= X.width) return;
if (y*SIZE >= X.height) return;
uint2 pad = _Pad.xy / _Stride.xy;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) + uint2(dx+sx, dy+sy) / _Stride.xy;
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, pad);
GroupMemoryBarrierWithGroupSync();
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
//uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE; ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma( X_[q][dc],
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c + dc, k),
acc[q]);
//kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
{
uint ox = (x*SIZE+(q%SIZE)) * _Stride.x + sx;
uint oy = (y*SIZE+(q/SIZE)) * _Stride.y + sy;
if (ox < O.width && oy < O.height)
O.Set(n, oy, ox, k, acc[q]);
}
}
}
}
#undef X_
}

632
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute


NUMTHREADS((16,8,1), (16,8,1), (16,4,1))
void Conv2D_Kmod16_Nmod8_KNY(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.channels, O.batch, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint n = dispatchThreadID.y;
uint y = dispatchThreadID.z;
for (uint x = 0; x < O.width; ++x)
{
float v = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) continue;
if (oy - _Pad.w >= X.height) continue;
if (ox < _Pad.x) continue;
if (ox - _Pad.z >= X.width) continue;
for (uint c = 0; c < X.channels; ++c)
{
v += X.Get(n, oy-_Pad.y, ox-_Pad.x, c) * K.Get(dy, dx, c, k);
}
}
}
O.Set(n, y, x, k, v);
}
}
#undef CTILE

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod32_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_Xcache
#define K_ Conv_Kcache
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint width = O.width;
uint height = O.height;
uint x = nyx % width;
uint ny = nyx / width;
uint y = ny % height;
uint n = ny / height;
float b0 = B.Get(k*2+0);
float b1 = B.Get(k*2+1);
float4 v = float4(b0, b1,
b0, b1);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
bool mask = true;
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) mask = false;
if (oy - _Pad.w >= X.height) mask = false;
if (ox < _Pad.x) mask = false;
if (ox - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/(CTILE*2); ++m)
{
float x0 = 0;
float x1 = 0;
float x2 = 0;
float x3 = 0;
if (mask)
{
x0 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0);
x1 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1);
x2 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0);
x3 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1);
}
float k0 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0);
float k1 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1);
float k2 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0);
float k3 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1);
//X_[gy][gx] = float4(x0, x1,
// x2, x3);
//K_[gy][gx] = float4(k0, k1,
// k2, k3);
X_[0][gy][gx] = x0;
X_[1][gy][gx] = x1;
X_[2][gy][gx] = x2;
X_[3][gy][gx] = x3;
K_[0][gy][gx] = k0;
K_[1][gy][gx] = k1;
K_[2][gy][gx] = k2;
K_[3][gy][gx] = k3;
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < CTILE; ++i)
{
float4 x = //X_[gy][i];
float4( X_[0][gy][i],
X_[1][gy][i],
X_[2][gy][i],
X_[3][gy][i]);
float4 k = //K_[i][gx];
float4( K_[0][i][gx],
K_[1][i][gx],
K_[2][i][gx],
K_[3][i][gx]);
v.x = mad(k.x, x.x, v.x);
v.x = mad(k.z, x.y, v.x);
v.y = mad(k.y, x.x, v.y);
v.y = mad(k.w, x.y, v.y);
v.z = mad(k.x, x.z, v.z);
v.z = mad(k.z, x.w, v.z);
v.w = mad(k.y, x.z, v.w);
v.w = mad(k.w, x.w, v.w);
//v.x += k.x*x.x + k.z*x.y;
//v.y += k.y*x.x + k.w*x.y;
//v.z += k.x*x.z + k.z*x.w;
//v.w += k.y*x.z + k.w*x.w;
}
GroupMemoryBarrierWithGroupSync();
}
}
}
O.Set(n*2+0, y, x, k*2+0, v.x);
O.Set(n*2+0, y, x, k*2+1, v.y);
O.Set(n*2+1, y, x, k*2+0, v.z);
O.Set(n*2+1, y, x, k*2+1, v.w);
#undef X_
#undef K_
}
#undef CTILE
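Each thread in Conv2D_Cache_KCmod32_KNyx accumulates a 2x2 register block: two consecutive output channels (k*2+0, k*2+1) for two consecutive batch images (n*2+0, n*2+1), seeded from the bias pair in the float4 v. The CTILE x CTILE groupshared tiles of X and K are filled cooperatively once per channel chunk and then reused CTILE times by every thread, the standard shared-memory plus register-blocking layout for GEMM-style convolution; the KNyxDiv2 variant below applies the same idea but packs two neighbouring output columns instead of two batch images.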

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod32_KNyxDiv2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_Xcache2
#define K_ Conv_Kcache2
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint width = O.width / 2;
uint height = O.height;
uint x = nyx % width;
uint ny = nyx / width;
uint y = ny % height;
uint n = ny / height;
float b0 = B.Get(k*2+0);
float b1 = B.Get(k*2+1);
float4 v = float4(b0, b1,
b0, b1);
bool mask = n < O.batch;
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
bool maskY = mask;
uint oy = y * _Stride.y + dy;
if (oy < _Pad.y) maskY = false;
if (oy - _Pad.w >= X.height) maskY = false;
bool maskL = maskY;
uint oxL = (x*2+0) * _Stride.x + dx;
if (oxL < _Pad.x) maskL = false;
if (oxL - _Pad.z >= X.width) maskL = false;
bool maskR = maskY;
uint oxR = (x*2+1) * _Stride.x + dx;
if (oxR < _Pad.x) maskR = false;
if (oxR - _Pad.z >= X.width) maskR = false;
for (uint m = 0; m < X.channels/(CTILE*2); ++m)
{
if (maskL)
{
X_[0][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+0);
X_[1][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+1);
}
else
{
X_[0][gy][gx] = X_[1][gy][gx] = 0;
}
if (maskR)
{
X_[2][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+0);
X_[3][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+1);
}
else
{
X_[2][gy][gx] = X_[3][gy][gx] = 0;
}
K_[0][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0);
K_[1][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1);
K_[2][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0);
K_[3][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1);
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < CTILE; ++i)
{
float4 x =
float4( X_[0][gy][i],
X_[1][gy][i],
X_[2][gy][i],
X_[3][gy][i]);
float4 k =
float4( K_[0][i][gx],
K_[1][i][gx],
K_[2][i][gx],
K_[3][i][gx]);
v.x = mad(k.x, x.x, v.x);
v.x = mad(k.z, x.y, v.x);
v.y = mad(k.y, x.x, v.y);
v.y = mad(k.w, x.y, v.y);
v.z = mad(k.x, x.z, v.z);
v.z = mad(k.z, x.w, v.z);
v.w = mad(k.y, x.z, v.w);
v.w = mad(k.w, x.w, v.w);
}
GroupMemoryBarrierWithGroupSync();
}
}
}
O.Set(n, y, x*2+0, k*2+0, v.x);
O.Set(n, y, x*2+0, k*2+1, v.y);
if (mask && x*2+1 < O.width)
{
O.Set(n, y, x*2+1, k*2+0, v.z);
O.Set(n, y, x*2+1, k*2+1, v.w);
}
#undef X_
#undef K_
}

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 4, O.batch * O.height * O.width / 4, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_XcacheR
#define K_ Conv_KcacheR
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint x = nyx % O.width;
uint ny = nyx / O.width;
uint y = ny % O.height;
uint n = ny / O.height;
float v[RTILE][RTILE];
for (uint xxxx = 0; xxxx < RTILE; ++xxxx)
{
float b = B.Get(k*RTILE+xxxx);
for (uint yyyy = 0; yyyy < RTILE; ++yyyy)
v[yyyy][xxxx] = b;
}
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
bool mask = true;
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) mask = false;
if (oy - _Pad.w >= X.height) mask = false;
if (ox < _Pad.x) mask = false;
if (ox - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/(CTILE*RTILE); ++m)
{
for (uint yy = 0; yy < RTILE; ++yy)
for (uint xx = 0; xx < RTILE; ++xx)
{
if (mask)
X_[yy*RTILE+xx][gy*CTILE+gx] = X.Get(n*RTILE+yy, oy - _Pad.y, ox - _Pad.x, (m*CTILE + gx)*RTILE+xx);
else
X_[yy*RTILE+xx][gy*CTILE+gx] = 0;
K_[yy*RTILE+xx][gy*CTILE+gx] = K.Get(dy, dx, (m*CTILE + gy)*RTILE+yy, k*RTILE+xx);
}
GroupMemoryBarrierWithGroupSync();
for (uint ii = 0; ii < CTILE; ++ii)
{
float x[RTILE][RTILE];
float k[RTILE][RTILE];
[unroll]
for (uint yy = 0; yy < RTILE; ++yy)
{
[unroll]
for (uint xx = 0; xx < RTILE; ++xx)
{
x[yy][xx] = X_[yy*RTILE+xx][gy*CTILE+ii];
k[yy][xx] = K_[yy*RTILE+xx][ii*CTILE+gx];
}
}
[unroll]
for (uint yyy = 0; yyy < RTILE; ++yyy)
{
[unroll]
for (uint xxx = 0; xxx < RTILE; ++xxx)
{
[unroll]
for (uint i = 0; i < RTILE; ++i)
{
v[yyy][xxx] = mad(x[yyy][i], k[i][xxx], v[yyy][xxx]);
}
}
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
for (uint yy = 0; yy < RTILE; ++yy)
for (uint xx = 0; xx < RTILE; ++xx)
O.Set(n*RTILE+yy, y, x, k*RTILE+xx, v[yy][xx]);
#undef X_
#undef K_
}

438
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute


#pragma kernel Dense_L1Cached64
#pragma kernel DenseTiled16x16
//#pragma kernel DenseTiled32x32
//#pragma kernel DenseTiled64x64
#pragma kernel DenseTiled32x32
#pragma kernel DenseTiled64x64
#include "Tensor.cginc"

[numthreads(CACHESIZE, 1, 1)]
void Dense_L1Cached64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ Dense_L1Cached64_X
uint x = CACHESIZE * groupID.x + groupThreadID.x;
uint y = groupID.y;
uint wIndex = W.Index(0, x);
float acc = B.Get(x);
// loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps
for (uint i = 0; i < X.GetFlatWidth(); i += CACHESIZE)
{
// Cache X
// coalescent reads
X_[groupThreadID.x] = X.SafeGet(y, i + groupThreadID.x);
GroupMemoryBarrierWithGroupSync();
// X * W
if (i + CACHESIZE <= X.GetFlatWidth())
{
[unroll]
for (uint di = 0; di < CACHESIZE; ++di)
{
acc = fastfma(X_[di], W.data[wIndex], acc);
wIndex += W.GetFlatWidth();
}
}
else
{
// handle remainder of the line < CACHESIZE
for (uint di = 0; i + di < X.GetFlatWidth(); ++di)
{
acc = fastfma(X_[di], W.data[wIndex], acc);
wIndex += W.GetFlatWidth();
}
}
GroupMemoryBarrierWithGroupSync();
}
// needed all threads to load matrix line, x might be out of the bounds for writing
if (x < O.GetFlatWidth())
O.Set(y, x, acc);
#undef X_
}
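In other words, Dense_L1Cached64 computes O[y][x] = B[x] + sum over j of X[y][j] * W[j][x] with the j axis walked in CACHESIZE-wide chunks: every thread of the group loads one element of the current X chunk into the groupshared X_ array (the coalescent read above), and all CACHESIZE threads then replay that chunk against their own column of W, so each X element is fetched from global memory once per thread group instead of once per output element.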

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled16x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache
#define W_ DenseTiled_Wcache
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
bool mask = (x < O.GetFlatWidth() && y < O.GetFlatHeight());
float v = B.Get(x);
for (uint m = 0; m < X.GetFlatWidth()/TILE_WIDTH; ++m)
{
if (mask)
{
X_[ty][tx] = X.Get(y, m*TILE_WIDTH + tx);
W_[ty][tx] = W.Get(m*TILE_WIDTH + ty, x);
}
else
{
X_[ty][tx] = 0;
W_[ty][tx] = 0;
}
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < TILE_WIDTH; ++i)
{
v = fastfma(X_[ty][i], W_[i][tx], v);
}
GroupMemoryBarrierWithGroupSync();
}
if (mask)
O.Set(y, x, v);
#undef X_
#undef W_
}
#undef TILE_WIDTH
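DenseTiled16x16 is the textbook shared-memory GEMM tiling: each TILE_WIDTH x TILE_WIDTH group stages one tile of X and one tile of W in groupshared arrays, synchronizes, and then reads every staged value TILE_WIDTH times from shared memory rather than from global memory. The 32x32 and 64x64 variants below keep the same structure but additionally give each thread a 2x2 and 4x4 register block of outputs, trading registers for more reuse per loaded value.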

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled32x32(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth / 2, O.flatHeight / 2, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache32
#define W_ DenseTiled_Wcache32
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
float b0 = B.Get(x*2+0);
float b1 = B.Get(x*2+1);
float4 v = float4(b0, b1,
b0, b1);
for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*2);)
{
float x0 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+0);
float x1 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+1);
float x2 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+0);
float x3 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+1);
float w0 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+0);
float w1 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+1);
float w2 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+0);
float w3 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+1);
++m;
X_[0][ty][tx] = x0;
X_[1][ty][tx] = x1;
X_[2][ty][tx] = x2;
X_[3][ty][tx] = x3;
W_[0][ty][tx] = w0;
W_[1][ty][tx] = w1;
W_[2][ty][tx] = w2;
W_[3][ty][tx] = w3;
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < TILE_WIDTH; ++i)
{
float4 x =
float4( X_[0][ty][i],
X_[1][ty][i],
X_[2][ty][i],
X_[3][ty][i]);
float4 w =
float4( W_[0][i][tx],
W_[1][i][tx],
W_[2][i][tx],
W_[3][i][tx]);
v.x = fastfma(w.x, x.x, v.x);
v.y = fastfma(w.y, x.x, v.y);
v.z = fastfma(w.x, x.z, v.z);
v.w = fastfma(w.y, x.z, v.w);
v.x = fastfma(w.z, x.y, v.x);
v.y = fastfma(w.w, x.y, v.y);
v.z = fastfma(w.z, x.w, v.z);
v.w = fastfma(w.w, x.w, v.w);
}
GroupMemoryBarrierWithGroupSync();
}
O.Set(y*2+0, x*2+0, v.x);
O.Set(y*2+0, x*2+1, v.y);
O.Set(y*2+1, x*2+0, v.z);
O.Set(y*2+1, x*2+1, v.w);
#undef X_
#undef W_
}
#undef TILE_WIDTH

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled64x64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth / 4, O.flatHeight / 4, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache64
#define W_ DenseTiled_Wcache64
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
float b0 = B.Get(x*4+0);
float b1 = B.Get(x*4+1);
float b2 = B.Get(x*4+2);
float b3 = B.Get(x*4+3);
float4 v0, v1, v2, v3;
v0 = v1 = v2 = v3 = float4(b0, b1, b2, b3);
for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*4); ++m)
{
for (uint yy = 0; yy < 4; ++yy)
for (uint xx = 0; xx < 4; ++xx)
{
X_[yy*4+xx][ty*TILE_WIDTH+tx] = X.Get(y*4+yy, (m*TILE_WIDTH + tx)*4+xx);
W_[yy*4+xx][ty*TILE_WIDTH+tx] = W.Get((m*TILE_WIDTH + ty)*4+yy, x*4+xx);
}
GroupMemoryBarrierWithGroupSync();
for (uint i = 0; i < TILE_WIDTH; ++i)
{
[unroll]
for (uint q = 0; q < 4; ++q)
{
float x0 = X_[0*4+q][ty*TILE_WIDTH+i];
float x1 = X_[1*4+q][ty*TILE_WIDTH+i];
float x2 = X_[2*4+q][ty*TILE_WIDTH+i];
float x3 = X_[3*4+q][ty*TILE_WIDTH+i];
float w0 = W_[q*4+0][i*TILE_WIDTH+tx];
float w1 = W_[q*4+1][i*TILE_WIDTH+tx];
float w2 = W_[q*4+2][i*TILE_WIDTH+tx];
float w3 = W_[q*4+3][i*TILE_WIDTH+tx];
v0.x = fastfma(x0, w0, v0.x); //--
v1.x = fastfma(x1, w0, v1.x);
v2.x = fastfma(x2, w0, v2.x);
v3.x = fastfma(x3, w0, v3.x);
v0.y = fastfma(x0, w1, v0.y); //--
v1.y = fastfma(x1, w1, v1.y);
v2.y = fastfma(x2, w1, v2.y);
v3.y = fastfma(x3, w1, v3.y);
v0.z = fastfma(x0, w2, v0.z); //--
v1.z = fastfma(x1, w2, v1.z);
v2.z = fastfma(x2, w2, v2.z);
v3.z = fastfma(x3, w2, v3.z);
v0.w = fastfma(x0, w3, v0.w); //--
v1.w = fastfma(x1, w3, v1.w);
v2.w = fastfma(x2, w3, v2.w);
v3.w = fastfma(x3, w3, v3.w);
}
GroupMemoryBarrierWithGroupSync();
}
}
O.Set(y*4+0, x*4+0, v0.x);
O.Set(y*4+0, x*4+1, v0.y);
O.Set(y*4+0, x*4+2, v0.z);
O.Set(y*4+0, x*4+3, v0.w);
O.Set(y*4+1, x*4+0, v1.x);
O.Set(y*4+1, x*4+1, v1.y);
O.Set(y*4+1, x*4+2, v1.z);
O.Set(y*4+1, x*4+3, v1.w);
O.Set(y*4+2, x*4+0, v2.x);
O.Set(y*4+2, x*4+1, v2.y);
O.Set(y*4+2, x*4+2, v2.z);
O.Set(y*4+2, x*4+3, v2.w);
O.Set(y*4+3, x*4+0, v3.x);
O.Set(y*4+3, x*4+1, v3.y);
O.Set(y*4+3, x*4+2, v3.z);
O.Set(y*4+3, x*4+3, v3.w);
#undef X_
#undef W_
}

30
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute


float2 Unpack(SharedTensor t, uint y, uint x)
{
uint v = asuint(t.data[t.Index(y, x) >> 1]);
// TEMPORARY: f16tof32 is broken in GLSL/Metal compiler
// using custom conversion function for now
//return float2(f16tof32(v), f16tof32(v>>16));
return float2(f16tof32_(v), f16tof32_(v>>16));
}
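The FP16 path stores two half-precision values per 32-bit word, so Unpack reads one uint and decodes both halves with the custom f16tof32_ helper (the built-in f16tof32 being unreliable on the GLSL/Metal path, per the comment above). The helper itself is outside this hunk; purely as an illustration of what such a decode involves (an assumption, not the shipped implementation, and ignoring Inf/NaN), a minimal version could look like:

float f16tof32_sketch(uint v)          // half value assumed in the low 16 bits
{
    uint expo = (v >> 10) & 0x1Fu;     // 5 exponent bits, bias 15
    uint mant = v & 0x3FFu;            // 10 mantissa bits
    float s = (v & 0x8000u) ? -1.0 : 1.0;
    if (expo == 0)                     // zero and denormals
        return s * (mant / 1024.0) * exp2(-14.0);
    return s * (1.0 + mant / 1024.0) * exp2((float)expo - 15.0);
}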
// NOTE: usually this path is used for <16 batches

DISPATCH_ARGS(O.flatWidth/2, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
float2 acc = Unpack(B, 0, x*2);
for (uint i = 0; i < X.width; ++i)
{
float2 w = Unpack(W, i, x*2);
acc += X.Get(y, i) * w;
}
O.Set(y, x*2+0, acc[0]);
O.Set(y, x*2+1, acc[1]);
}

944
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute
File diff is too large to display.

214
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute


[numthreads(THREAD_COUNT, 1, 1)]
void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// @TODO: DISPATCH_ARGS(...)
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_XcacheR
#define W_ DenseTiled_WcacheR
uint id = groupThreadID.x;
uint bx = groupID.x;
uint by = groupID.y;
uint bbx = id % BLOCK_WIDTH;
uint bby = id / BLOCK_WIDTH;
float v[BLOCK_WIDTH][BLOCK_WIDTH];
for (uint yy = 0; yy < BLOCK_WIDTH; ++yy)
for (uint xx = 0; xx < BLOCK_WIDTH; ++xx)
{
float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx);
v[yy][xx] = bias;
}
for (uint m = 0; m < X.GetFlatWidth()/LOAD_DEPTH; ++m)
{
for (uint q = 0; q < LOAD_DEPTH; ++q)
{
X_[q][id] = X.Get(by*LOAD_WIDTH + id, m*LOAD_DEPTH + q);
W_[q][id] = W.Get(m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id);
}
GroupMemoryBarrierWithGroupSync();
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
[unroll] for (uint i = 0; i < LOAD_DEPTH; ++i)
{
v[yyy][xxx] = mad(X_[i][bby*BLOCK_WIDTH + yyy], W_[i][bbx*BLOCK_WIDTH + xxx], v[yyy][xxx]);
}
GroupMemoryBarrierWithGroupSync();
}
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, v[yyy][xxx]);
#undef X_
#undef W_
}

[numthreads(THREAD_COUNT, 1, 1)]
void Conv2D_Kernel3x3_64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// @TODO: DISPATCH_ARGS(...)
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_XcacheR
#define K_ Conv_KcacheR
uint id = groupThreadID.x;
uint bx = groupID.x;
uint by = groupID.y;
uint bbx = id % BLOCK_WIDTH;
uint bby = id / BLOCK_WIDTH;
uint width = O.width;
uint height = O.height;
// ASSERT(LOAD_WIDTH == THREAD_COUNT)
uint loadNYX = by*LOAD_WIDTH + id; // only works for 8x8
uint loadX = loadNYX % width;
uint loadNY = loadNYX / width;
uint loadY = loadNY % height;
uint loadN = loadNY / height;
// @TODO: validate that _Stride works, added the following 2 lines without testing
loadX *= _Stride.x;
loadY *= _Stride.y;
float v[BLOCK_WIDTH][BLOCK_WIDTH];
[unroll] for (uint yy = 0; yy < BLOCK_WIDTH; ++yy)
[unroll] for (uint xx = 0; xx < BLOCK_WIDTH; ++xx)
{
float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx);
v[yy][xx] = bias;
}
for (uint dy = 0; dy < 3; ++dy)
{
bool mask = true;
if (loadY+dy < _Pad.y) mask = false;
if (loadY+dy - _Pad.w >= X.height) mask = false;
for (uint dx = 0; dx < 3; ++dx)
{
if (loadX+dx < _Pad.x) mask = false;
if (loadX+dx - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/LOAD_DEPTH; ++m)
{
for (uint q = 0; q < LOAD_DEPTH; ++q)
{
if (mask)
X_[q][id] = X.Get(loadN, loadY+dy-_Pad.y, loadX+dx-_Pad.x, m*LOAD_DEPTH + q);
else
X_[q][id] = 0;
K_[q][id] = K.Get(dy, dx, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id);
}
GroupMemoryBarrierWithGroupSync();
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
[unroll] for (uint i = 0; i < LOAD_DEPTH; ++i)
{
v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * K_[i][bbx*BLOCK_WIDTH + xxx];
}
GroupMemoryBarrierWithGroupSync();
}
}
}
[unroll] for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
{
uint saveNYX = by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy;
uint saveX = saveNYX % width;
uint saveNY = saveNYX / width;
uint saveY = saveNY % height;
uint saveN = saveNY / height;
uint saveK = bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx;
O.Set(saveN, saveY, saveX, saveK, v[yyy][xxx]);
}
#undef X_
#undef K_
}

483
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute


#pragma kernel ScaleBias
#pragma kernel ScaleBias_CNyx2
#pragma kernel ScaleBias_Flat
#pragma kernel Upsample2D
#pragma kernel AvgPool2D
#pragma kernel MaxPool2D

#pragma kernel InstanceNorm
#pragma kernel Copy
/*
ScaleBias_Flat+ScaleBias_CNyx2 (NEW) vs ScaleBias+ScaleBias_CNyx
Compute Precompiled
MOBILENET@4
<<<Exec #64: 66.5 ms, cpu: 7.7 ms, avg: 66.3 ms, result:OK <--- NEW!
<<<Exec #64: 66.7 ms, cpu: 8.0 ms, avg: 67.1 ms, result:OK
*/
#include "Tensor.cginc"
TENSOR_DECL(X)

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void ScaleBias(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
float bias = B.Get(0, 0, 0, c);
float scale = W.Get(0, 0, 0, c);
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
float bias = B.Get(0, 0, 0, c);
float scale = W.Get(0, 0, 0, c);
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}

NUMTHREADS((256,1,1), (128,1,1), (64,1,1))
void ScaleBias_Flat(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.length, 1, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint i = dispatchThreadID.x;
if (i > O.GetLength()) return;
uint c = i % X.channels;
float bias = B.Get(c);
float scale = W.Get(c);
float v = X.Get(i);
v = v * scale + bias;
O.Set(i, v);
}

NUMTHREADS((32,4,1), (32,2,1), (16,2,1))
void ScaleBias_CNyx2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint i = dispatchThreadID.y * X.channels + c;
if (c >= X.channels) return;
if (i >= X.GetLength()) return;
float bias = B.Get(c);
float scale = W.Get(c);
float v = X.Get(i);
v = v * scale + bias;
O.Set(i, v);
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(X.channels, X.width, X.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= X.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float v = X.Get(n, y, x, c);
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Pool.y + dy;
uint ox = x * _Pool.x + dx;
O.Set(n, oy, ox, c, v);
}
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float maxV = -FLT_MAX;
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.w < X.height) && (ox - _Pad.z < X.width);
float v = (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0;
maxV = max(v, maxV);
}
O.Set(n, y, x, c, maxV);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float acc = 0;
float counter = 0;
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.w < X.height) && (ox - _Pad.z < X.width);
acc += (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0;
counter += (mask)? 1: 0;
}
acc /= counter;
O.Set(n, y, x, c, acc);
}
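Note that this average-pooling variant divides by counter, the number of taps that actually landed inside X, so outputs near the border are averaged only over valid input pixels rather than over the full _Pool window (i.e. padding is excluded from the mean).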
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float maxV = -FLT_MAX;
for (uint dy = 0; dy < _Pool[1]; ++dy)
for (uint dx = 0; dx < _Pool[0]; ++dx)
{
float v = X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c);
maxV = max(v, maxV);
}
O.Set(n, y, x, c, maxV);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
float invPoolSize = 1.0f / (_Pool[0] * _Pool[1]);
for (uint n = 0; n < X.batch; ++n)
{
float v = 0;
for (uint dy = 0; dy < _Pool[1]; ++dy)
for (uint dx = 0; dx < _Pool[0]; ++dx)
v += X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c) * invPoolSize;
O.Set(n, y, x, c, v);
}
}
NUMTHREADS((4,8,8), (4,8,4), (4,4,4))

DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v0 = X.Get(n, y*2, x*2, c);
float v1 = X.Get(n, y*2+1, x*2, c);
float v2 = X.Get(n, y*2, x*2+1, c);
float v3 = X.Get(n, y*2+1, x*2+1, c);
float v = max(v0, max(v1, max(v2, v3)));
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, 1, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
if (c >= O.channels) return;
//ASSERT(X.batch == O.batch)
for (uint n = 0; n < X.batch; ++n)
{
float v = 0;
for (uint y = 0; y < X.height; ++y)
for (uint x = 0; x < X.width; ++x)
v += X.Get(n, y, x, c);
v /= (X.height * X.width);
O.Set(n, 0, 0, c, v);
}
DISPATCH_ARGS(O.channels, 1, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
if (c >= O.channels) return;
//ASSERT(X.shape == O.shape)
float gamma = W.Get(0, 0, 0, c);
float beta = B.Get(0, 0, 0, c);
for (uint n = 0; n < O.batch; ++n)
{
uint x, y;
// calc mean
float acc = 0;
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
acc += X.Get(n, y, x, c);
float mean = acc / (O.width * O.height);
// calc variance
acc = 0;
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
{
float delta = X.Get(n, y, x, c) - mean;
acc += delta * delta;
}
float var = acc / (O.width * O.height);
// normalization factor
float invNormFactor = 1 / sqrt(var + FLT_EPSILON);
float scale = gamma * invNormFactor;
float bias = beta - gamma * mean * invNormFactor;
// apply normalization
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
{
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}
}
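The scale/bias pair above is just the instance-normalization identity folded into one multiply-add per element: gamma * (x - mean) / sqrt(var + FLT_EPSILON) + beta == scale * x + bias, with scale = gamma / sqrt(var + FLT_EPSILON) and bias = beta - mean * scale, which is why the final per-pixel loop only needs v * scale + bias.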
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(X.channels, X.width, X.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= X.channels) return; if (x >= X.width) return; if (y >= X.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
O.Set(n + _Pad[0], y + _Pad[1], x + _Pad[2], c + _Pad[3], v);
}
}

44
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc


// Copyright: Copyleft 2012 :-)
float RandomUsingCos(float4 seed)
{
float4 K1 = float4( // Transcendental numbers:
0.64341054629, // (Cahen's constant)
23.14069263277926, // e^pi (Gelfond's constant)
2.665144142690225, // 2^sqrt(2) (Gelfond-Schneider constant)
3.14159265359 // pi
);
return frac(cos(dot(seed, K1)) * 12345.6789);
}
// Based on: https://stackoverflow.com/questions/4200224/random-noise-functions-for-glsl

// A single iteration of Bob Jenkins' One-At-A-Time hashing algorithm.
uint hash(uint x)
{
x += ( x << 10u );
x ^= ( x >> 6u );
x += ( x << 3u );
x ^= ( x >> 11u );
x += ( x << 15u );
return x;
}
uint hash( uint2 v ) { return hash( v.x ^ hash(v.y) ); }
uint hash( uint3 v ) { return hash( v.x ^ hash(v.y) ^ hash(v.z) ); }

// All zeroes yields 0.0, all ones yields the next smallest representable value below 1.0.
float floatConstruct(uint m)
{
const uint ieeeMantissa = 0x007FFFFFu; // binary32 mantissa bitmask
const uint ieeeOne = 0x3F800000u; // 1.0 in IEEE binary32
m &= ieeeMantissa; // Keep only mantissa bits (fractional part)
m |= ieeeOne; // Add fractional part to 1.0
float f = asfloat(m); // Range [1:2]
return f - 1.0; // Range [0:1]
return floatConstruct(hash(asuint(seed)));
}
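To see why floatConstruct lands in [0, 1): masking to the 23 mantissa bits and OR-ing in 0x3F800000 always produces a float in [1.0, 2.0), since m = 0 gives exactly 1.0 and m = 0x007FFFFF gives the largest representable value just below 2.0. Subtracting 1.0 therefore maps the hashed bits onto [0.0, 1.0) with full mantissa resolution, matching the comment above the function.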

float Random(float4 seed)
{
return RandomUsingCos(seed);
return Random(seed) <= p ? 1: 0;
}

480
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc


struct Tensor
{
// @TODO: actually uint seems not like a good idea anymore, consider going to int
uint batch, height, width, channels;
void Init(uint4 nhwc)
{
batch = nhwc.x;
height = nhwc.y;
width = nhwc.z;
channels = nhwc.w;
}
uint4 Dims()
{
return uint4(batch, height, width, channels);
}
uint GetFlatHeight()
{
return batch;
}
uint GetFlatWidth()
{
return height * width * channels;
}
uint GetKernelHeight()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelHeight = batch;
return kernelHeight;
}
uint GetKernelWidth()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelWidth = height;
return kernelWidth;
}
uint GetKernelDepth()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelDepth = width;
return kernelDepth;
}
uint GetKernelCount()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelCount = channels;
return kernelCount;
}
uint GetLength()
{
return batch * height * width * channels;
}
uint Index(uint b, uint h, uint w, uint ch)
{
uint index =
b * height * width * channels +
h * width * channels +
w * channels +
ch;
return index;
}
uint Index(uint b, uint i)
{
uint index =
b * height * width * channels +
i;
return index;
}
StructuredBuffer<float> data;
void Init(uint4 nhwc, StructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch)];
}
float Get(uint b, uint2 pos, uint ch)
{
return data[Index(b, pos.y, pos.x, ch)];
}
float Get(uint b, uint i)
{
return data[Index(b,i)];
}
float Get(uint i)
{
return data[i];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return data[Index(b, pos.y, pos.x, ch)];
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
RWStructuredBuffer<float> data;
void Init(int4 nhwc, RWStructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch)];
}
float Get(uint b, uint2 pos, uint ch)
{
return data[Index(b, pos.y, pos.x, ch)];
}
float Get(uint b, uint i)
{
return data[Index(b,i)];
}
float Get(uint i)
{
return data[i];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return Get(b, pos.y, pos.x, ch);
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
void Set(uint b, uint h, uint w, uint ch, float v)
{
data[Index(b,h,w,ch)] = v;
}
void Set(uint y, uint x, float v)
{
data[Index(y,x)] = v;
}
void Set(uint i, float v)
{
data[i] = v;
}
StructuredBuffer<float> data;
uint offset;
void Init(uint4 nhwc, uint4 info, StructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
offset = info.x;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch) + offset];
}
float Get(uint b, uint2 pos, uint ch)
{
return Get(b, pos.y, pos.x, ch);
}
float Get(uint b, uint i)
{
return data[Index(b,i) + offset];
}
float Get(uint i)
{
return data[i + offset];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return Get(b, pos, ch);
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
};
#define TENSOR_DECL(X) uint4 X##decl[2]; StructuredBuffer<float> X##data;

float fastfma(float a, float b, float c)
{
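// a*b + c written as a single dot product; presumably a nudge for the shader compiler to emit one mad/fma.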
return dot(float2(a,c), float2(b, 1));
}
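For reference, the flat offset computed by Tensor::Index above is plain batch-major NHWC arithmetic. A small C# sketch of the same formula (illustrative only, not Barracuda code):

static class NhwcSketch
{
    // Flat NHWC offset, matching Tensor::Index(b, h, w, ch) above:
    // b*H*W*C + h*W*C + w*C + ch, written in Horner form.
    public static int Index(int b, int h, int w, int ch, int height, int width, int channels)
    {
        return ((b * height + h) * width + w) * channels + ch;
    }
    // Example: element (b=1, h=2, w=3, ch=0) of a 2x4x4x8 tensor sits at offset
    // ((1*4 + 2)*4 + 3)*8 + 0 = 216; the Index(b, i) overload is simply b*H*W*C + i.
}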

112
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute


struct TextureAsTensor : Tensor
{
Texture2D<float4> tex;
SamplerState smp;
Texture2DArray<float4> texArray;
SamplerState smpArray;
void Init(uint4 nhwc, Texture2D<float4> tex_, SamplerState sampler_, Texture2DArray<float4> texArray_, SamplerState samplerArray_)
{
Tensor::Init(nhwc);
tex = tex_;
smp = sampler_;
texArray = texArray_;
smpArray = samplerArray_;
}
float4 Get(uint b, uint y, uint x)
{
float3 loc = float3((float)x / (float)width, (float)y / (float)height, b);
if (batch > 1)
return texArray.SampleLevel(smpArray, loc, 0);
else
return tex.SampleLevel(smp, loc.xy, 0);
}
};
#define TENSOR_SHARED2_ARGS3(A, B, S, O) TENSOR_SHARED_ARG(A, S); TENSOR_SHARED_ARG(B, S); TENSOR_ARG_RW(O);

{
// @TODO: currently it fails to compile, needs to be investigated
#if 0
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TextureAsTensor X; X.Init(Xdecl[0], Xtex2D, samplerXtex2D, Xtex2DArray, samplerXtex2DArray);
TENSOR_SHARED_ARG(K, WBK);
TENSOR_SHARED_ARG(B, WBK);
TENSOR_ARG_RW(O);
// ASSERT(X.channels <= MAX_CHANNELS)
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) continue;
if (oy - _Pad.w >= X.height) continue;
if (ox < _Pad.x) continue;
if (ox - _Pad.z >= X.width) continue;
float4 in4channels = X.Get(n, oy - _Pad.y, ox - _Pad.x);
for (uint c = 0; c < X.channels && c < MAX_CHANNELS; ++c)
{
acc += in4channels[c] * K.Get(dy, dx, c, k);
}
}
}
O.Set(n, y, x, k, acc);
}
#endif
}

23
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md


# Release notes
## 0.2.0
- Version bumped to 0.2.0 as this release brings breaking API changes; see the details below.
- Significantly reduced temporary memory allocations by introducing internal allocator support. Memory is now re-used between layer executions as much as possible.
- Improved small-workload performance on the CSharp backend.
- Added a parallel implementation for multiple activation functions on the CSharp backend.
- Added a `Peek()` function to `IWorker`; it keeps the tensor in the worker's allocator, which is handy for quickly grabbing an output. If you need the output tensor's contents to survive subsequent `Execute()` invocations, use `Fetch()` instead (see the usage sketch after this list).
- Fixed ESRGAN model conversion (ONNX importer).
- Fixed Tensor <-> Texture copy for textures/tensors whose dimensions are not a multiple of 8.
- Added a `Summary()` method to `Worker`. Currently it returns allocator information.
- Tabs to spaces! Aiming at a higher salary (https://stackoverflow.blog/2017/06/15/developers-use-spaces-make-money-use-tabs/).
- Renamed worker type enum members: `CSharp` -> `CSharpRef`, `CSharpFast` -> `CSharp`, `Compute` -> `ComputeRef`, `ComputeFast` -> `Compute`.
- Implemented a new optimized `ComputePrecompiled` worker. This worker caches Compute kernels and state up front to reduce CPU overhead.
- Added `ExecuteAsync()` to the `IWorker` interface. It returns an `IEnumerator`, which lets you control how many layers are scheduled per frame (one iteration == one layer).
- Added `Log` op support on Compute workers.
- Optimized activation functions and ScaleBias by accessing the tensor as a contiguous array. Gained ~2.0 ms on a batch-4 MobileNet (MBP2016).
- Introduced _Loop versions of activations to work around the 65535 dispatch limit on D3D11.
- Added .nn as the Barracuda model file extension for use in the Unity Editor, along with a simple editor importer. You can now declare serializable fields as NNModel to bind them to a .nn asset; ModelLoader.Load() now accepts NNModel as a source.
- Compute: Reduce reference GPU implementation.
- TF importer: expanded Mean support to mean over channels, implemented Pad (as Border2D), implemented SquaredDifference, added InstanceNormalization and LeakyRelu patterns, and implemented StridedSlice.
- TF importer: model nodes are now sorted by dependency before processing.
- Fixed a ComputeBuffer leak when using the Compute and ComputePrecompiled backends.
- Conv2D_L1Cached64_RegisterBlock4x4 is now used more often: improves perf ~2x on Vega 16 and ~30% on Nvidia and Intel.
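As a rough illustration of the 0.2.0 items above (the renamed worker types, `Peek()` vs `Fetch()`, and `ExecuteAsync()`), here is a minimal C# sketch. The argument lists, the `Model` type name, and the output name "output" are assumptions based on these notes and on the LearningBrain.cs changes further down, not a verified API reference.

    using System.Collections;
    using Barracuda;

    public class WorkerUsageSketch
    {
        public IEnumerator Run(Model model, bool verbose)
        {
            // Worker types after the rename: CSharpRef, CSharp, ComputeRef, Compute, ComputePrecompiled.
            var worker = BarracudaWorkerFactory.CreateWorker(
                BarracudaWorkerFactory.Type.ComputePrecompiled, model, verbose);

            worker.Execute();                        // synchronous path (inputs omitted in this sketch)
            var borrowed = worker.Peek("output");    // stays in the worker's allocator, valid until the next Execute()
            var owned = worker.Fetch("output");      // survives later Execute() calls; the caller disposes it
            owned.Dispose();

            // Asynchronous path: one iteration schedules roughly one layer.
            var schedule = worker.ExecuteAsync();
            while (schedule.MoveNext())
                yield return null;                   // e.g. spread layer scheduling across frames in a coroutine
        }
    }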
## 0.1.6
- Added printing of the activation type in verbose mode.
- Added fast, parallel CPU implementations for Swish, Relu, Add, Sub, Div, Min, Max, Tanh, Exp.

2
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json


{
"name": "com.unity.barracuda",
"displayName": "Barracuda",
"version": "0.1.6-preview",
"version": "0.2.0-preview",
"unity": "2017.4",
"description": "Barracuda is lightweight and cross-platform Neural Net inference library. Barracuda supports inference both on GPU and CPU.",
"dependencies": {}

2
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs


var elementType = src.GetType().GetElementType();
var elementSize = Marshal.SizeOf(elementType);
var dest = Array.CreateInstance(elementType, shape);
Buffer.BlockCopy(src, 0, dest, 0, src.Length * elementSize);
Buffer.BlockCopy(src, 0, dest, 0, dest.Length * elementSize);
return dest;
}

7
UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs


_barracudaModel = ModelLoader.Load(model.Value);
var executionDevice = inferenceDevice == InferenceDevice.GPU
? BarracudaWorkerFactory.Type.ComputeFast
: BarracudaWorkerFactory.Type.CSharpFast;
? BarracudaWorkerFactory.Type.ComputePrecompiled
: BarracudaWorkerFactory.Type.CSharp;
_engine = BarracudaWorkerFactory.CreateWorker(executionDevice, _barracudaModel, _verbose);
}

var outputs = new List<Tensor>();
foreach (var name in names)
{
var outp = _engine.Fetch(name);
var outp = _engine.Peek(name);
outp.Dispose();
}
return outputs;

354
ml-agents/mlagents/trainers/barracuda.py


from collections import defaultdict
import numpy as np
import json
import struct # convert from Python values and C structs
import re
import argparse
import os.path

self.globals = []
self.memories = []
def __init__(self, **entries):
self.__dict__.update(entries)
def __init__(self, **entries): self.__dict__.update(entries)
parser.add_argument("source_file", help=help)
parser.add_argument("target_file", help="output Barracuda binary file")
parser.add_argument("-trim", "--trim-unused-by-output")
parser.add_argument("--print-layers", action="store_true")
parser.add_argument("--print-source-json", action="store_true")
parser.add_argument("-json", "--print-barracuda-json", action="store_true")
parser.add_argument("--print-layer-links", action="store_true")
parser.add_argument("--print-patterns", action="store_true")
parser.add_argument("--print-tensors", action="store_true")
parser.add_argument("--verbose", action="store_true")
parser.add_argument('source_file', help=help)
parser.add_argument('target_file', help='output Barracuda binary file')
parser.add_argument('-trim', '--trim-unused-by-output')
parser.add_argument('--print-layers', action='store_true')
parser.add_argument('--print-source-json', action='store_true')
parser.add_argument('-json', '--print-barracuda-json', action='store_true')
parser.add_argument('--print-layer-links', action='store_true')
parser.add_argument('--print-patterns', action='store_true')
parser.add_argument('--print-tensors', action='store_true')
parser.add_argument('--verbose', action='store_true')
args.compress_f16 = (
False
) # TEMP: disabled, until properly implemented parser.add_argument('-f16', '--compress-f16', action='store_true')
output_extension = ".bc" if not args.compress_f16 else ".f16.bc"
args.compress_f16 = False # TEMP: disabled, until properly implemented parser.add_argument('-f16', '--compress-f16', action='store_true')
output_extension = '.bc' if not args.compress_f16 else '.f16.bc'
print("File", args.source_file, "does not exist.")
print('File', args.source_file, 'does not exist.')
return os.path.splitext(os.path.basename(filename))[0] + newExtenstion
return os.path.splitext(os.path.basename(filename))[0] + newExtenstion;
args.target_file = os.path.join(
args.target_file,
replaceFilenameExtension(args.source_file, output_extension),
)
args.target_file = os.path.join(args.target_file, replaceFilenameExtension(args.source_file, output_extension))
if args.verbose:
print(args)

# Fuse training time BatchNorm tensors into Scale & Bias
def fuse_batchnorm_weights(gamma, beta, mean, var, epsilon):
# https://github.com/Tencent/ncnn/blob/master/src/layer/batchnorm.cpp

bias = beta - gamma * mean / np.sqrt(var + epsilon)
return [scale, bias]
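For context, this is the standard BatchNorm folding (see the ncnn reference above): presumably scale = gamma / np.sqrt(var + epsilon), so the bias line shown here equals beta - mean * scale; the scale assignment itself simply falls outside the lines included in this hunk.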
if hasattr(model, "layers"):
if hasattr(model, 'layers'):
model = model.layers
inputs_and_memories = set(list(inputs) + list(memories[1::3]))

ready.add(l.name)
return missing
# Class to represent a graph
class Graph:
def __init__(self, vertices):
self.graph = defaultdict(list) # dictionary containing adjacency List
self.V = vertices # No. of vertices
# function to add an edge to graph
def addEdge(self, u, v):
self.graph[u].append(v)
# A recursive function used by topologicalSort
def topologicalSortUtil(self, v, visited, stack):
# Mark the current node as visited.
class Graph:
def __init__(self,vertices):
self.graph = defaultdict(list) #dictionary containing adjacency List
self.V = vertices #No. of vertices
# function to add an edge to graph
def addEdge(self,u,v):
self.graph[u].append(v)
# A recursive function used by topologicalSort
def topologicalSortUtil(self,v,visited,stack):
# Mark the current node as visited.
# Recur for all the vertices adjacent to this vertex
for i in self.graph[v]:
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
# Push current vertex to stack which stores result
stack.insert(0,v)
# Recur for all the vertices adjacent to this vertex
for i in self.graph[v]:
if visited[i] == False:
self.topologicalSortUtil(i, visited, stack)
# Push current vertex to stack which stores result
stack.insert(0, v)
# The function to do Topological Sort. It uses recursive
# topologicalSortUtil()
def topologicalSort(self):
# Mark all the vertices as not visited
visited = [False] * self.V
stack = []
# Call the recursive helper function to store Topological
# Sort starting from all vertices one by one
for i in range(self.V):
if visited[i] == False:
self.topologicalSortUtil(i, visited, stack)
# print(stack)
# The function to do Topological Sort. It uses recursive
# topologicalSortUtil()
def topologicalSort(self):
# Mark all the vertices as not visited
visited = [False]*self.V
stack =[]
# Call the recursive helper function to store Topological
# Sort starting from all vertices one by one
for i in range(self.V):
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
#print(stack)
if len(find_missing_inputs(model, inputs_and_memories)) == 0:
if (len(find_missing_inputs(model, inputs_and_memories)) == 0):
return model
g = Graph(len(model))

for l in model:
layers[l.name] = id
layers[l.name] = id;
id += 1
for layer in model:

print("SORTED:", sorted_layer_indices)
new_model = [model[idx] for idx in sorted_layer_indices]
assert len(find_missing_inputs(new_model, inputs_and_memories)) == 0
assert(len(find_missing_inputs(new_model, inputs_and_memories)) == 0)
if hasattr(model, "layers"):
if hasattr(model, 'layers'):
def flatten(items, enter=lambda x: isinstance(x, list)):
def flatten(items,enter=lambda x:isinstance(x, list)):
# http://stackoverflow.com/a/40857703
# https://github.com/ctmakro/canton/blob/master/canton/misc.py
"""Yield items from any nested iterable; see REF."""

yield x
def trim_model(model, outputs):
layers = {l.name: l for l in model}
layers = {l.name:l for l in model}
connected = {o for o in outputs}
while len(outputs) > 0:
outputs = set(flatten([layers[o].inputs for o in outputs if o in layers]))

connected.add(o)
trimmed = [l.name for l in model if l.name not in connected]
return str(arr)[1:-1] # array to string without brackets
print("TRIMMED:", array_without_brackets(trimmed))
return [l for l in model if l.name in connected]

print("Trimming model given outputs to preserve:", preserve_outputs)
model = trim_model(model, preserve_outputs)
else:
print(
"WARNING: Trim couldn't find any layers to match:", criteria_regexp_string
)
print("WARNING: Trim couldn't find any layers to match:", criteria_regexp_string)
compress_classes = {"Dense"}
compress_classes = {
'Dense'
}
if l.class_name in compress_classes:
print(
"Compressing %s layer '%s' weights to float16" % (l.class_name, l.name)
)
if (l.class_name in compress_classes):
print("Compressing %s layer '%s' weights to float16" % (l.class_name, l.name))
if isinstance(o, np.ndarray): # skip binary data packed inside ndarray
if getattr(o, "__dict__", None):
if getattr(o, '__dict__', None):
s = json.dumps(model.layers, cls=StructEncoder, separators=(", ", ":"))
s = json.dumps(model.layers, cls=StructEncoder, separators=(', ',':'))
s = s.replace("]}, {", "]},\n{")
s = s.replace(":[{", ":[\n\t{")
s = s.replace("}, {", "},\n\t{")
s = s.replace(']}, {', ']},\n{')
s = s.replace(':[{', ':[\n\t{')
s = s.replace('}, {', '},\n\t{')
return str(arr)[1:-1] # array to string without brackets
if print_layer_links:
for l in model.layers:

if model.globals:
if isinstance(model.globals, dict):
model.globals = {x.name: x.shape for x in model.globals}
model.globals = {x.name:x.shape for x in model.globals}
ins = {i: model.inputs[i] for i in l.inputs if i in model.inputs}
ins = {i:model.inputs[i] for i in l.inputs if i in model.inputs}
else:
ins = [i for i in l.inputs if i in model.inputs]
if ins:

print("OUT:", array_without_brackets(model.outputs))
if print_tensors:
if (print_tensors):
def __init__(self, scope=""):
def __init__(self, scope=''):
if attr == "_":
if attr == '_':
return self.layers[-1].name if len(self.layer) > 0 else self.scope
raise AttributeError(attr)

i = 1
while name in self.names_taken:
name = self.layers[-1].op + "_" + str(i)
name = self.layers[-1].op + '_' + str(i)
self.layers[-1].name = self.scope + ("/" if self.scope else "") + name
self.layers[-1].name = self.scope + ('/' if self.scope else '') + name
def concat(self, a, b, out=""):
self.layers += [Struct(name=out, op="Concat", input=[a, b])]
def concat(self, a, b, axis=-1, out=''):
self.layers += [Struct(name=out, op='Concat', axis=axis, input=[a, b])]
return self._patch_last_layer_name_and_return()
def mad(self, x, kernel, bias, out=''):
self.layers += [Struct(name=out, op='Dense', input=[x, kernel, bias])]
return self._patch_last_layer_name_and_return()
def mul(self, a, b, out=''):
self.layers += [Struct(name=out, op='Mul', input=[a, b])]
return self._patch_last_layer_name_and_return()
def add(self, a, b, out=''):
self.layers += [Struct(name=out, op='Add', input=[a, b])]
def mad(self, x, kernel, bias, out=""):
self.layers += [Struct(name=out, op="Dense", input=[x, kernel, bias])]
def sub(self, a, b, out=''):
self.layers += [Struct(name=out, op='Sub', input=[a, b])]
def mul(self, a, b, out=""):
self.layers += [Struct(name=out, op="Mul", input=[a, b])]
def sigmoid(self, x, out=''):
self.layers += [Struct(name=out, op='Sigmoid', input=[x])]
def add(self, a, b, out=""):
self.layers += [Struct(name=out, op="Add", input=[a, b])]
def tanh(self, x, out=''):
self.layers += [Struct(name=out, op='Tanh', input=[x])]
def sub(self, a, b, out=""):
self.layers += [Struct(name=out, op="Sub", input=[a, b])]
def reduce(self, op, x, axis=-1, out=''):
self.layers += [Struct(name=out, op='Reduce'+op, axis=axis, input=[x])]
def sigmoid(self, x, out=""):
self.layers += [Struct(name=out, op="Sigmoid", input=[x])]
def pool(self, op, x, out=''):
self.layers += [Struct(name=out, op=op+'Pool', input=[x])]
def tanh(self, x, out=""):
self.layers += [Struct(name=out, op="Tanh", input=[x])]
def strided_slice(self, x, begin, end, strides, rank, out=''):
self.layers += [Struct(name=out, op='StridedSlice', rank=rank, starts=begin, ends=end, slice_strides=strides, input=[x])]
def mean(name, input, axis=-1):
''' combines mean operation out of several simpler ops
'''
nn = Build(name)
if np.array_equal(axis, [1,2]):
nn.pool('GlobalAvg', input, out=name)
elif np.array_equal(axis, [1,2,3]):
nn.reduce('Mean', # over channels
nn.pool('GlobalAvg', input), # over height & width
out=name)
elif np.array_equal(axis, [3]) or np.array_equal(axis, [-1]) or np.array_equal(axis, 3) or np.array_equal(axis, -1):
nn.reduce('Mean', input, out=name)
return nn.layers
def rnn(name, input, state, kernel, bias, new_state, number_of_gates=2):
""" - Ht = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
"""
def rnn(name, input, state, kernel, bias, new_state, number_of_gates = 2):
''' - Ht = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
'''
nn.tanh(nn.mad(kernel=kernel, bias=bias, x=nn.concat(input, state)), out=new_state)
nn.tanh(
nn.mad(kernel=kernel, bias=bias,
x=nn.concat(input, state)),
out=new_state);
def gru(
name,
input,
state,
kernel_r,
kernel_u,
kernel_c,
bias_r,
bias_u,
bias_c,
new_state,
number_of_gates=2,
):
""" - zt = f(Xt*Wz + Ht_1*Rz + Wbz + Rbz)
def gru(name, input, state, kernel_r, kernel_u, kernel_c, bias_r, bias_u, bias_c, new_state, number_of_gates = 2):
''' - zt = f(Xt*Wz + Ht_1*Rz + Wbz + Rbz)
"""
'''
nn = Build(name)
inputs = nn.concat(input, state)

c = nn.tanh(nn.mad(kernel=kernel_c, bias=bias_c, x=nn.concat(input, r_state)))
c = nn.tanh(nn.mad(kernel=kernel_c, bias=bias_c,
x=nn.concat(input, r_state)))
# new_h = u' * state + (1 - u') * c'
# = u' * state + c' - u' * c'

# - u' * c'
nn.sub(nn._, nn.mul(u, c), out=new_state)
return nn.layers
nn.sub(nn._, nn.mul(u, c),
out=new_state)
return nn.layers;
def lstm(
name,
input,
state_c,
state_h,
kernel_i,
kernel_j,
kernel_f,
kernel_o,
bias_i,
bias_j,
bias_f,
bias_o,
new_state_c,
new_state_h,
):
""" Full:
def lstm(name, input, state_c, state_h, kernel_i, kernel_j, kernel_f, kernel_o, bias_i, bias_j, bias_f, bias_o, new_state_c, new_state_h):
''' Full:
- it = f(Xt*Wi + Ht_1*Ri + Pi . Ct_1 + Wbi + Rbi)
- ft = f(Xt*Wf + Ht_1*Rf + Pf . Ct_1 + Wbf + Rbf)
- ct = g(Xt*Wc + Ht_1*Rc + Wbc + Rbc)

"""
'''
""" No peephole:
''' No peephole:
- it = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
- ft = f(Xt*Wf + Ht_1*Rf + Wbf + Rbf)
- ct = g(Xt*Wc + Ht_1*Rc + Wbc + Rbc)

"""
'''
j = nn.tanh(nn.mad(inputs, kernel_j, bias_j))
nn.add(nn.mul(state_c, f), nn.mul(i, j), out=new_state_c)
nn.add(
nn.mul(state_c, f), nn.mul(i, j),
out=new_state_c)
# new_h =
nn.mul(o, nn.tanh(new_state_c), out=new_state_h)
# new_h =
nn.mul(o, nn.tanh(new_state_c),
out=new_state_h)
# Serialize
class BarracudaWriter:

self.f = open(filename, "wb+")
self.f = open(filename, 'wb+')
def __enter__(self):
return self

def write_str(self, s):
self.write_int32(len(s))
self.f.write(s.encode("ascii"))
self.f.write(s.encode('ascii'))
self.f.write(struct.pack("<f", d))
self.f.write(struct.pack('<f', d))
self.f.write(struct.pack("<i", d))
self.f.write(struct.pack('<i', d))
self.f.write(struct.pack("<q", d))
self.f.write(struct.pack('<q', d))
def write_shape(self, s):
self.write_int32(len(s))

def close(self):
self.f.close()
# VERSION = 0xBA22AC0DA000 + BARRACUDA_VERSION
#VERSION = 0xBA22AC0DA000 + BARRACUDA_VERSION
w.write_int64(BARRACUDA_VERSION)
# inputs

w.write_str_array(model.outputs)
# memories
w.write_int32(len(model.memories) // 3)
for mem_shape, mem_in, mem_out in zip(
model.memories[0::3], model.memories[1::3], model.memories[2::3]
):
w.write_int32(len(model.memories)//3)
for mem_shape, mem_in, mem_out in zip(model.memories[0::3], model.memories[1::3], model.memories[2::3]):
w.write_shape(mem_shape)
w.write_str(mem_in)
w.write_str(mem_out)

w.write_int32(len(model.layers))
for l in model.layers:
assert not l.name in l.inputs
assert(not l.name in l.inputs)
w.write_int32(0) # dummy
w.write_int32(0) # dummy
w.write_int32(0) #dummy
w.write_int32(0) #dummy
w.write_shape(l.pads)
w.write_shape(l.strides)
w.write_shape(l.pool_size)

w.write_int32(0) # dummy
w.write_int32(0) #dummy
assert len(x.shape) == 4
assert x.data.nbytes % 4 == 0
length = (
x.data.nbytes >> 2
) # length is measured in float32s (at least for now)
assert(len(x.shape) == 4)
assert(x.data.nbytes % 4 == 0)
length = x.data.nbytes >> 2 # length is measured in float32s (at least for now)
w.write_str(x.name)
w.write_shape(x.shape)

for x in all_tensors:
w.write_array(x.data)

914
ml-agents/mlagents/trainers/tensorflow_to_barracuda.py
File diff too large to display.

2
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta


fileFormatVersion: 2
guid: 83221ad3db87f4b3b91b041047cb2bc5
guid: 19ed1486aa27d4903b34839f37b8f69f
MonoImporter:
externalObjects: {}
serializedVersion: 2

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta


fileFormatVersion: 2
guid: 4b10c58689ee84c2abe895327686f532
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta


fileFormatVersion: 2
guid: e192a80b369ad4683a329432eeb5ec20
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef


{
"name": "Barracuda-editor",
"references": [],
"includePlatforms": [
"Editor"
],
"excludePlatforms": []
}

7
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta


fileFormatVersion: 2
guid: 9f1e7d835703842dda0e25142ed6c3c9
AssemblyDefinitionImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png

Width: 64  |  Height: 64  |  Size: 2.3 KiB

106
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta


fileFormatVersion: 2
guid: 57d823f2746e44dc79116df94518bd27
TextureImporter:
fileIDToRecycleName: {}
externalObjects: {}
serializedVersion: 4
mipmaps:
mipMapMode: 0
enableMipMap: 0
sRGBTexture: 0
linearTexture: 0
fadeOut: 0
borderMipMap: 0
mipMapsPreserveCoverage: 0
alphaTestReferenceValue: 0.5
mipMapFadeDistanceStart: 1
mipMapFadeDistanceEnd: 3
bumpmap:
convertToNormalMap: 0
externalNormalMap: 0
heightScale: 0.25
normalMapFilter: 0
isReadable: 0
grayScaleToAlpha: 0
generateCubemap: 6
cubemapConvolution: 0
seamlessCubemap: 0
textureFormat: 1
maxTextureSize: 2048
textureSettings:
serializedVersion: 2
filterMode: -1
aniso: 1
mipBias: -1
wrapU: 1
wrapV: 1
wrapW: -1
nPOTScale: 0
lightmap: 0
compressionQuality: 50
spriteMode: 0
spriteExtrude: 1
spriteMeshType: 1
alignment: 0
spritePivot: {x: 0.5, y: 0.5}
spritePixelsToUnits: 100
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
spriteGenerateFallbackPhysicsShape: 1
alphaUsage: 1
alphaIsTransparency: 1
spriteTessellationDetail: -1
textureType: 2
textureShape: 1
maxTextureSizeSet: 0
compressionQualitySet: 0
textureFormatSet: 0
platformSettings:
- buildTarget: DefaultTexturePlatform
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Standalone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: iPhone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Android
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
spriteSheet:
serializedVersion: 2
sprites: []
outline: []
physicsShape: []
spritePackingTag:
userData:
assetBundleName:
assetBundleVariant:

42
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs


using System.IO;
using UnityEditor;
using UnityEngine;
using UnityEditor.Experimental.AssetImporters;
namespace Barracuda
{
/// <summary>
/// Asset Importer of barracuda models.
/// </summary>
[ScriptedImporter(1, new[] {"nn"})]
public class NNModelImporter : ScriptedImporter {
private const string iconName = "NNModelIcon";
private Texture2D iconTexture;
public override void OnImportAsset(AssetImportContext ctx)
{
var model = File.ReadAllBytes(ctx.assetPath);
var asset = ScriptableObject.CreateInstance<NNModel>();
asset.Value = model;
ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
ctx.SetMainObject(asset);
}
private Texture2D LoadIconTexture()
{
if (iconTexture == null)
{
string[] allCandidates = AssetDatabase.FindAssets(iconName);
if (allCandidates.Length > 0)
{
iconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
}
}
return iconTexture;
}
}
}
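To show how the imported asset is meant to be consumed (per the 0.2.0 release notes above), a minimal sketch follows; the MonoBehaviour and field names are ours, and the only API facts assumed are that NNModel is a ScriptableObject asset and that ModelLoader.Load() accepts an NNModel.

using UnityEngine;
using Barracuda;

// Illustrative only: binds an imported .nn asset to a component field.
public class NNModelHolder : MonoBehaviour
{
    public NNModel modelAsset;   // assign the imported .nn asset in the Inspector

    void Start()
    {
        // Release notes: ModelLoader.Load() now accepts an NNModel as a source.
        var model = ModelLoader.Load(modelAsset);
        // ...create a worker for 'model', as LearningBrain.cs does above.
    }
}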

29
UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs


using System.IO;
using UnityEditor;
using UnityEngine;
using UnityEditor.Experimental.AssetImporters;
using MLAgents.InferenceBrain;
namespace MLAgents
{
/// <summary>
/// Asset Importer of barracuda models.
/// </summary>
[ScriptedImporter(1, new[] {"nn"})]
public class NNModelImporter : ScriptedImporter {
private const string IconPath = "Assets/ML-Agents/Resources/NNModelIcon.png";
public override void OnImportAsset(AssetImportContext ctx)
{
var model = File.ReadAllBytes(ctx.assetPath);
var asset = ScriptableObject.CreateInstance<NNModel>();
asset.Value = model;
Texture2D texture = (Texture2D)
AssetDatabase.LoadAssetAtPath(IconPath, typeof(Texture2D));
ctx.AddObjectToAsset(ctx.assetPath, asset, texture);
ctx.SetMainObject(asset);
}
}
}

10
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs


using UnityEngine;
namespace MLAgents.InferenceBrain
{
public class NNModel : ScriptableObject
{
[HideInInspector]
public byte[] Value;
}
}

11
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta


fileFormatVersion: 2
guid: fb1293e6d636b46d09ae35b36241a0c6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

/UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs.meta → /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta
