
Merge pull request #2049 from Unity-Technologies/develop-barracuda-0.2.0

Barracuda 0.2.1 -> develop
/develop-generalizationTraining-TrainerController
GitHub 6 years ago
Current commit
f13d0f11
62 files changed, with 4,891 additions and 4,000 deletions
  1. 2
      UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta
  2. 2
      UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta
  3. 2
      UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta
  4. 2
      UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta
  5. 2
      UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta
  6. 2
      UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta
  7. 2
      UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta
  8. 2
      UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta
  9. 2
      UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta
  10. 2
      UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta
  11. 2
      UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta
  12. 2
      UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta
  13. 5
      UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat
  14. 5
      UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat
  15. 2
      UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta
  16. 2
      UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta
  17. 2
      UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta
  18. 2
      UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta
  19. 2
      UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta
  20. 2
      UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta
  21. 111
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md
  22. 1000
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
  23. 918
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute
  24. 944
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute
  25. 68
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute
  26. 596
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute
  27. 632
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute
  28. 438
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute
  29. 30
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute
  30. 944
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute
  31. 214
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute
  32. 484
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute
  33. 44
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc
  34. 480
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc
  35. 112
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute
  36. 57
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md
  37. 2
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json
  38. 5
      UnitySDK/Assets/ML-Agents/Scripts/Agent.cs
  39. 31
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs
  40. 23
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs
  41. 22
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs
  42. 15
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs
  43. 15
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs
  44. 11
      UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs
  45. 370
      ml-agents/mlagents/trainers/barracuda.py
  46. 926
      ml-agents/mlagents/trainers/tensorflow_to_barracuda.py
  47. 2
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta
  48. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta
  49. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta
  50. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef
  51. 7
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta
  52. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png
  53. 106
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta
  54. 42
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs
  55. 29
      UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs
  56. 8
      UnitySDK/Assets/ML-Agents/Resources/NNModelIcon.png
  57. 106
      UnitySDK/Assets/ML-Agents/Resources/NNModelIcon.png.meta
  58. 10
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs
  59. 11
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta
  60. 0
      /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.10980392, g: 0.6039216, b: 1, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.5, g: 0.5, b: 0.5, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

111
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md


### Load Model into Barracuda
Once you have converted your TensorFlow (or ONNX) model, you can load the resulting Barracuda file via `ModelLoader`:
```C#
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".bytes");
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".nn");
```
Another option is to use the editor model importer. Just add a public `NNModel` field to your C# script and assign the ``.nn`` model file via the editor UI:
```C#
public NNModel modelSource;
<..>
var model = ModelLoader.Load(modelSource);
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputeFast, model);
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputePrecompiled, model);
```
### Execute the model

Execution is asynchronous for GPU backends. The implementation is currently synchronous for CPU backends; however, it is safe to assume that execution will be asynchronous for all backends in the future.
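As a rough, illustrative sketch (not part of this diff), the typical flow with the API shown above looks like the following; the single-input assumption, the tensor shape, and the exact `Execute` overload are placeholders that may differ between Barracuda versions:
```C#
// Minimal sketch, assuming a single-input model; the (1, 8) shape is a hypothetical example.
var model = ModelLoader.Load(modelSource);
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputePrecompiled, model);

var input = new Tensor(1, 8);   // batch of 1, 8 float features
worker.Execute(input);          // asynchronous on GPU backends, currently synchronous on CPU
var output = worker.Peek();     // single-output model; see "Fetch outputs" below
```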
### Fetch outputs
If the model has only a single output, then simple `worker.Fetch()` can be used; otherwise output names should be provided.
If the model has only a single output, then simple `worker.Peek()` can be used; otherwise output names should be provided.
```C#
var O = worker.Fetch(outputName);
var O = worker.Peek(outputName);
```
_Note:_ ``Peek()`` does not take ownership of the tensor. If you expect to keep the tensor for a longer time, use ``Fetch()``.
### Cleanup
As a Barracuda client you are responsible for calling `Dispose` on the _worker_, the _inputs_, and any _outputs_ you fetched. This is necessary to properly free GPU resources.
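For example, a minimal cleanup sketch, assuming the worker and tensors follow the usual `IDisposable` pattern (the variable names are hypothetical):
```C#
// Dispose everything you created or fetched; this releases the underlying GPU buffers.
input.Dispose();
output.Dispose();   // a tensor obtained via Fetch(); Peek()'d tensors remain owned by the worker
worker.Dispose();
```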

### Texture as output
If you want to use Barracuda execution results further in the graphics pipeline, you can copy data from a `Tensor` into a `RenderTexture` without stalling the CPU or GPU:
```C#
var tensor = worker.Fetch();
var tensor = worker.Peek();
var texture = BarracudaTextureUtils.TensorToRenderTexture(tensor);
```
If you wish, you can reuse the same `RenderTexture` multiple times:
```C#
var tensor = worker.Fetch();
var tensor = worker.Peek();
BarracudaTextureUtils.TensorToRenderTexture(tensor, texture);
```

Convert from TensorFlow:
```bash
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.bytes
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.bytes
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.nn
```
If the network has multiple outputs, but you need only particular ones during inference, there is an optional `-trim` flag to remove unused outputs and calculations.

Trim will first remove from the graph any outputs that do not match the given regular expression; in the example below, only outputs ending with `action` are kept.
Next, trim will strip all nodes that do not participate in the evaluation of the remaining outputs.
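Illustrative only (check the converter's help output for the exact `-trim` syntax); the regular expression below is a hypothetical example:
```bash
# Keep only outputs whose names end with "action", then strip nodes the remaining outputs do not depend on.
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn -trim action$
```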
You can pass `--print-supported-ops` to get an approximate list of supported operations/activations for a specific converter.
P.S. Python 3.5 or 3.6 is recommended
## Approximate list of supported layers/operations for TensorFlow converter
```
Activation
Add
AvgPool
BatchNormalization
BatchNormalizationRuntime
BiasAdd
Concat
Conv2D
Conv2DBackpropInput
Dense
DepthwiseConv2dNative
Flatten
FusedBatchNorm
GlobalAveragePool
GlobalAvgPool
InstanceNormalization
LRN
MatMul
Max
MaxPool
Maximum
Mean
Min
Minimum
Mul
Multinomial
Nop
OneHot
Pad
Pow
Prod
RandomStandardNormal
RandomUniform
RealDiv
Reshape
ResizeBicubic
ResizeBilinear
ResizeNearestNeighbor
StridedSlice
Sub
Sum
```
## Approximate list of supported activations for TensorFlow converter
```
Abs
Acos
Acosh
Asin
Asinh
Atan
Atanh
Ceil
Cos
Cosh
Elu
Exp
Floor
LeakyRelu
Linear
Log
LogSoftmax
Neg
Relu
Relu6
Selu
Sigmoid
Sin
Sinh
Softmax
Softplus
Softsign
Sqrt
Swish
Tan
Tanh
```
P.S. Some of these operations have limited support and not all configurations are properly supported
P.P.S. We plan to migrate the TensorFlow and ONNX converters from Python to C# in the future.
P.P.S. Python 3.5 or 3.6 is recommended
P.P.P.S. We plan to migrate the TensorFlow and ONNX converters from Python to C# in the future.

1000
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
The file diff is too large to display.

918
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute


#pragma kernel Relu_Flat
#pragma kernel Relu_Loop
#pragma kernel Relu6_Flat
#pragma kernel Relu6_Loop
#pragma kernel Tanh_Flat
#pragma kernel Tanh_Loop
#pragma kernel Swish_Flat
#pragma kernel Swish_Loop
#pragma kernel Sigmoid_Flat
#pragma kernel Sigmoid_Loop
#pragma kernel Elu_Flat
#pragma kernel Elu_Loop
#pragma kernel LeakyRelu_Flat
#pragma kernel LeakyRelu_Loop
#pragma kernel Exp_Flat
#pragma kernel Exp_Loop
#pragma kernel Log_Flat
#pragma kernel Log_Loop
#pragma kernel Pow_Flat
#pragma kernel Pow_Loop
/*
Relu_Flat (NEW) vs Relu_Nyxc+Relu_CNyx+Relu
Compute Precompiled
VGG@1
<<<Exec #128: 59.6 ms, cpu: .9 ms, avg: 62.4 ms, result:OK <--- NEW!
<<<Exec #128: 63.6 ms, cpu: .9 ms, avg: 64.0 ms, result:OK
VGG@4
<<<Exec #16: 276.7 ms, cpu: .9 ms, avg: 272.8 ms, result:OK <--- NEW!
<<<Exec #16: 297.5 ms, cpu: .9 ms, avg: 274.4 ms, result:OK
RES@1
<<<Exec #100: 82.2 ms, cpu: 22.2 ms, avg: 81.0 ms, result:OK <--- NEW!
<<<Exec #100: 82.1 ms, cpu: 22.5 ms, avg: 85.4 ms, result:OK
PPO_2@256
<<<Exec #200: 10.3 ms, cpu: 7.6 ms, avg: 11.9 ms, result:OK <--- NEW!
<<<Exec #200: 10.9 ms, cpu: 8.3 ms, avg: 12.3 ms, result:OK
PPO_CNN@256
<<<Exec #100: 60.6 ms, cpu: 62.3 ms, avg: 65.6 ms, result:OK <--- NEW!
<<<Exec #100: 72.6 ms, cpu: 62.7 ms, avg: 66.0 ms, result:OK
*/
#pragma kernel Relu
#pragma kernel Relu_CNyx
#pragma kernel Relu_Nyxc

#pragma kernel Exp
#pragma kernel Exp_CNyx
#pragma kernel Exp_Nyxc
#pragma kernel Log
#pragma kernel Log_CNyx
#pragma kernel Log_Nyxc
#pragma kernel Pow
#pragma kernel Pow_CNyx
#pragma kernel Pow_Nyxc

TENSOR_DECL_RW(O)
float _Alpha;
uint _LoopStride;
#define FLAT_ACTIVATION(name, op_name) \
void name##_Flat (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
if (i > O.GetLength()) return;\
\
float v = X.Get(i);\
v = op_name (v);\
O.Set(i, v);\
}
#define LOOP_ACTIVATION(name, op_name) \
void name##_Loop (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
uint len = O.GetLength();\
\
while (i < len) {\
float v = X.Get(i); \
v = op_name (v); \
O.Set(i, v); \
i += _LoopStride; \
}\
}
#define ACTIVATION(name, op_name) \
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
FLAT_ACTIVATION(name, op_name)\
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
LOOP_ACTIVATION(name, op_name)
return 0.5f * (v + abs(v));
return 0.5f * (v + abs(v));
return min(max(0, v), 6);
return min(max(0, v), 6);
return v / (1.f + exp(-v));
return v / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
return max(v, _Alpha * v);
return max(v, _Alpha * v);
float signed_pow(float f, float e)
float signed_pow(float f)
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
float e = _Alpha;
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
ACTIVATION(Relu, relu)
ACTIVATION(Relu6, relu6)
ACTIVATION(Tanh, tanh)
ACTIVATION(Sigmoid, sigmoid)
ACTIVATION(Swish, swish)
ACTIVATION(Elu, elu)
ACTIVATION(LeakyRelu, lrelu)
ACTIVATION(Exp, exp)
ACTIVATION(Log, log)
ACTIVATION(Pow, signed_pow)
// -------------------
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Log(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Log_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
}
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Log_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}

DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
}

944
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute
The file diff is too large to display.

68
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute


NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastAdd(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastSub(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMul(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastDiv(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

float signed_pow(float f, float e)
{
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMin(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMax(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

596
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute


#pragma kernel Conv2D
#pragma kernel Conv2D_RegisterBlock4x2
//#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel Conv2D_L1Cached32_RegisterBlock4x4
#pragma kernel DepthwiseConv2D

NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
void Conv2D(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
for (uint c = 0; c < X.channels; ++c)
acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
}
}
for (uint c = 0; c < X.channels; ++c)
acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
}
}
O.Set(n, y, x, k, acc);
}
O.Set(n, y, x, k, acc);
}
}

void Conv2D_RegisterBlock4x2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x*SIZE_W >= O.width) return;
if (y*SIZE_H >= O.height) return;
if (k >= K.channels) return;
if (x*SIZE_W >= O.width) return;
if (y*SIZE_H >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
acc[q] = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
acc[q] = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE_H*SIZE_W];
[unroll]
for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);
for (uint c = 0; c < X.channels; ++c)
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
}
}
for (uint c = 0; c < X.channels; ++c)
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
}
}
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
}
[unroll]
for (q = 0; q < SIZE_H*SIZE_W; ++q)
O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
}
#undef L1CACHESIZE
#define L1CACHESIZE 64
#undef SIZE
#define SIZE 4
groupshared float Conv2D_L1Cached64_Reg_Loop_safe_X[SIZE*SIZE][L1CACHESIZE];
[numthreads(L1CACHESIZE, 1, 1)]
void Conv2D_L1Cached64_RegisterBlock4x4(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv2D_L1Cached64_Reg_Loop_safe_X
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= O.width) return;
if (y*SIZE >= O.height) return;
#define CONV2D_L1CACHED(L1CACHESIZE, SIZE, FMA) \
groupshared float Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[SIZE*SIZE][L1CACHESIZE];\
[numthreads(L1CACHESIZE, 1, 1)]\
void Conv2D_L1Cached##L1CACHESIZE##_RegisterBlock##SIZE##x##SIZE(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)\
{\
DISPATCH_ARGS(K.kernelCount, O.width, O.height);\
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);\
\
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;\
uint x = groupID.y;\
uint y = groupID.z;\
\
if (x*SIZE >= O.width) return;\
if (y*SIZE >= O.height) return;\
\
for (uint n = 0; n < O.batch; ++n)\
{\
float acc[SIZE*SIZE];\
[unroll]\
for (uint q = 0; q < SIZE*SIZE; ++q)\
acc[q] = B.SafeGet(k);\
\
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)\
{\
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)\
{\
uint2 pos[SIZE*SIZE];\
[unroll]\
for (uint q = 0; q < SIZE*SIZE; ++q)\
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);\
\
for (uint c = 0; c < X.channels; c += L1CACHESIZE)\
{\
uint dc = groupThreadID.x;\
[unroll]\
for (q = 0; q < SIZE*SIZE; ++q)\
Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);\
GroupMemoryBarrierWithGroupSync();\
\
if (k < K.channels)\
{\
uint kIndex = K.Index(dy, dx, c, k);\
for (dc = 0; dc < L1CACHESIZE; ++dc)\
{\
[unroll]\
for (q = 0; q < SIZE*SIZE; ++q)\
acc[q] = FMA(Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc], K.data[kIndex], acc[q]);\
kIndex += K.channels;\
}\
}\
GroupMemoryBarrierWithGroupSync();\
}\
}\
}\
\
uint remainderW = (O.width - x*SIZE);\
uint remainderH = (O.height - y*SIZE);\
\
if (k < K.channels)\
[unroll]\
for (q = 0; q < SIZE*SIZE; ++q)\
if (q/SIZE < remainderH && q%SIZE < remainderW)\
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);\
}\
\
}
for (uint n = 0; n < O.batch; ++n)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
CONV2D_L1CACHED(64,4, fastfma)
CONV2D_L1CACHED(32,4, fastfma)
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
GroupMemoryBarrierWithGroupSync();
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE; ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
uint remainderW = (O.width - x*SIZE);
uint remainderH = (O.height - y*SIZE);
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
if (q/SIZE < remainderH && q%SIZE < remainderW)
O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
}
#undef X_
}
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
acc = fastfma(
X.Get(n, pos, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
}
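The kernel above applies one 2D filter per channel (a depthwise convolution): output channel k reads only input channel k, via K.Get(dy, dx, 0, k). A minimal single-batch Python reference sketch, assuming a single symmetric padding value `pad` in place of the shader's separate _Pad components (all names are illustrative):

def depthwise_conv2d_reference(X, K, B, stride, pad):
    # X: [H][W][C] input, K: [kH][kW][C] per-channel filters, B: [C] bias
    H, W, C = len(X), len(X[0]), len(X[0][0])
    kH, kW = len(K), len(K[0])
    oH = (H + 2 * pad - kH) // stride + 1
    oW = (W + 2 * pad - kW) // stride + 1
    O = [[[B[c] for c in range(C)] for _ in range(oW)] for _ in range(oH)]
    for y in range(oH):
        for x in range(oW):
            for c in range(C):
                for dy in range(kH):
                    for dx in range(kW):
                        iy, ix = y * stride + dy - pad, x * stride + dx - pad
                        if 0 <= iy < H and 0 <= ix < W:  # skip taps that fall into the padding
                            O[y][x][c] += X[iy][ix][c] * K[dy][dx][c]
    return O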

{
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width, X.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
uint2 pad = _Pad.xy / _Stride.xy;
uint2 leftCorner = pad;
uint2 rightCorner = uint2(X.width, X.height) + pad;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc = B.Get(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy;
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
for (uint c = 0; c < X.channels; ++c)
{
acc = fastfma( X.Get(n, pos - leftCorner, c),
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c, k),
acc);
}
}
}
uint oy = y * _Stride.y + sy;
uint ox = x * _Stride.x + sx;
if (oy < O.height && ox < O.width)
O.Set(n, oy, ox, k, acc);
}
}
}
}
#undef L1CACHESIZE

[numthreads(L1CACHESIZE, 1, 1)]
void Conv2DTrans_L1Cached64_RegisterBlock2x2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width / SIZE, X.height / SIZE);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv2DTrans_L1Cached64_Reg_Loop_safe_X
uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
uint x = groupID.y;
uint y = groupID.z;
// need all threads to load channels, thus will do late check against kernel count
if (x*SIZE >= X.width) return;
if (y*SIZE >= X.height) return;
uint2 pad = _Pad.xy / _Stride.xy;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
acc[q] = B.SafeGet(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos[SIZE*SIZE];
[unroll]
for (uint q = 0; q < SIZE*SIZE; ++q)
pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) + uint2(dx+sx, dy+sy) / _Stride.xy;
for (uint c = 0; c < X.channels; c += L1CACHESIZE)
{
// Cache X
uint dc = groupThreadID.x;
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
X_[q][dc] = X.SafeGet(n, pos[q], c + dc, pad);
GroupMemoryBarrierWithGroupSync();
// X * K
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
{
//uint kIndex = K.Index(dy, dx, c, k);
for (dc = 0; dc < L1CACHESIZE; ++dc)
{
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
acc[q] = fastfma( X_[q][dc],
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c + dc, k),
acc[q]);
//kIndex += K.channels;
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
if (k < K.channels) // need all threads to load channels, thus late check against kernel count
[unroll]
for (q = 0; q < SIZE*SIZE; ++q)
{
uint ox = (x*SIZE+(q%SIZE)) * _Stride.x + sx;
uint oy = (y*SIZE+(q/SIZE)) * _Stride.y + sy;
if (ox < O.width && oy < O.height)
O.Set(n, oy, ox, k, acc[q]);
}
}
}
}
#undef X_
}

632
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute


NUMTHREADS((16,8,1), (16,8,1), (16,4,1))
void Conv2D_Kmod16_Nmod8_KNY(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(K.channels, O.batch, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint n = dispatchThreadID.y;
uint y = dispatchThreadID.z;
for (uint x = 0; x < O.width; ++x)
{
float v = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) continue;
if (oy - _Pad.w >= X.height) continue;
if (ox < _Pad.x) continue;
if (ox - _Pad.z >= X.width) continue;
for (uint c = 0; c < X.channels; ++c)
{
v += X.Get(n, oy-_Pad.y, ox-_Pad.x, c) * K.Get(dy, dx, c, k);
}
}
}
O.Set(n, y, x, k, v);
}
}
#undef CTILE

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod32_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_Xcache
#define K_ Conv_Kcache
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint width = O.width;
uint height = O.height;
uint x = nyx % width;
uint ny = nyx / width;
uint y = ny % height;
uint n = ny / height;
float b0 = B.Get(k*2+0);
float b1 = B.Get(k*2+1);
float4 v = float4(b0, b1,
b0, b1);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
bool mask = true;
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) mask = false;
if (oy - _Pad.w >= X.height) mask = false;
if (ox < _Pad.x) mask = false;
if (ox - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/(CTILE*2); ++m)
{
float x0 = 0;
float x1 = 0;
float x2 = 0;
float x3 = 0;
if (mask)
{
x0 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0);
x1 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1);
x2 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0);
x3 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1);
}
float k0 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0);
float k1 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1);
float k2 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0);
float k3 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1);
//X_[gy][gx] = float4(x0, x1,
// x2, x3);
//K_[gy][gx] = float4(k0, k1,
// k2, k3);
X_[0][gy][gx] = x0;
X_[1][gy][gx] = x1;
X_[2][gy][gx] = x2;
X_[3][gy][gx] = x3;
K_[0][gy][gx] = k0;
K_[1][gy][gx] = k1;
K_[2][gy][gx] = k2;
K_[3][gy][gx] = k3;
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < CTILE; ++i)
{
float4 x = //X_[gy][i];
float4( X_[0][gy][i],
X_[1][gy][i],
X_[2][gy][i],
X_[3][gy][i]);
float4 k = //K_[i][gx];
float4( K_[0][i][gx],
K_[1][i][gx],
K_[2][i][gx],
K_[3][i][gx]);
v.x = mad(k.x, x.x, v.x);
v.x = mad(k.z, x.y, v.x);
v.y = mad(k.y, x.x, v.y);
v.y = mad(k.w, x.y, v.y);
v.z = mad(k.x, x.z, v.z);
v.z = mad(k.z, x.w, v.z);
v.w = mad(k.y, x.z, v.w);
v.w = mad(k.w, x.w, v.w);
//v.x += k.x*x.x + k.z*x.y;
//v.y += k.y*x.x + k.w*x.y;
//v.z += k.x*x.z + k.z*x.w;
//v.w += k.y*x.z + k.w*x.w;
}
GroupMemoryBarrierWithGroupSync();
}
}
}
O.Set(n*2+0, y, x, k*2+0, v.x);
O.Set(n*2+0, y, x, k*2+1, v.y);
O.Set(n*2+1, y, x, k*2+0, v.z);
O.Set(n*2+1, y, x, k*2+1, v.w);
#undef X_
#undef K_
}
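The cached Conv2D variants above and below flatten (n, y, x) into one dispatch coordinate (nyx) and recover the indices with modulo and division. A small sketch of the same decoding, for reference:

def decode_nyx(nyx, width, height):
    # inverse of nyx = (n * height + y) * width + x, as used by the KNyx kernels
    x = nyx % width
    ny = nyx // width
    y = ny % height
    n = ny // height
    return n, y, x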
#undef CTILE

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod32_KNyxDiv2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_Xcache2
#define K_ Conv_Kcache2
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint width = O.width / 2;
uint height = O.height;
uint x = nyx % width;
uint ny = nyx / width;
uint y = ny % height;
uint n = ny / height;
float b0 = B.Get(k*2+0);
float b1 = B.Get(k*2+1);
float4 v = float4(b0, b1,
b0, b1);
bool mask = n < O.batch;
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
bool maskY = mask;
uint oy = y * _Stride.y + dy;
if (oy < _Pad.y) maskY = false;
if (oy - _Pad.w >= X.height) maskY = false;
bool maskL = maskY;
uint oxL = (x*2+0) * _Stride.x + dx;
if (oxL < _Pad.x) maskL = false;
if (oxL - _Pad.z >= X.width) maskL = false;
bool maskR = maskY;
uint oxR = (x*2+1) * _Stride.x + dx;
if (oxR < _Pad.x) maskR = false;
if (oxR - _Pad.z >= X.width) maskR = false;
for (uint m = 0; m < X.channels/(CTILE*2); ++m)
{
if (maskL)
{
X_[0][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+0);
X_[1][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+1);
}
else
{
X_[0][gy][gx] = X_[1][gy][gx] = 0;
}
if (maskR)
{
X_[2][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+0);
X_[3][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+1);
}
else
{
X_[2][gy][gx] = X_[3][gy][gx] = 0;
}
K_[0][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0);
K_[1][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1);
K_[2][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0);
K_[3][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1);
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < CTILE; ++i)
{
float4 x =
float4( X_[0][gy][i],
X_[1][gy][i],
X_[2][gy][i],
X_[3][gy][i]);
float4 k =
float4( K_[0][i][gx],
K_[1][i][gx],
K_[2][i][gx],
K_[3][i][gx]);
v.x = mad(k.x, x.x, v.x);
v.x = mad(k.z, x.y, v.x);
v.y = mad(k.y, x.x, v.y);
v.y = mad(k.w, x.y, v.y);
v.z = mad(k.x, x.z, v.z);
v.z = mad(k.z, x.w, v.z);
v.w = mad(k.y, x.z, v.w);
v.w = mad(k.w, x.w, v.w);
}
GroupMemoryBarrierWithGroupSync();
}
}
}
O.Set(n, y, x*2+0, k*2+0, v.x);
O.Set(n, y, x*2+0, k*2+1, v.y);
if (mask && x*2+1 < O.width)
{
O.Set(n, y, x*2+1, k*2+0, v.z);
O.Set(n, y, x*2+1, k*2+1, v.w);
}
#undef X_
#undef K_
}

[numthreads(CTILE, CTILE, 1)]
void Conv2D_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(K.kernelCount / 4, O.batch * O.height * O.width / 4, 1);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_XcacheR
#define K_ Conv_KcacheR
uint gx = groupThreadID.x;
uint gy = groupThreadID.y;
uint k = CTILE * groupID.x + groupThreadID.x;
uint nyx = CTILE * groupID.y + groupThreadID.y;
uint x = nyx % O.width;
uint ny = nyx / O.width;
uint y = ny % O.height;
uint n = ny / O.height;
float v[RTILE][RTILE];
for (uint xxxx = 0; xxxx < RTILE; ++xxxx)
{
float b = B.Get(k*RTILE+xxxx);
for (uint yyyy = 0; yyyy < RTILE; ++yyyy)
v[yyyy][xxxx] = b;
}
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
bool mask = true;
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) mask = false;
if (oy - _Pad.w >= X.height) mask = false;
if (ox < _Pad.x) mask = false;
if (ox - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/(CTILE*RTILE); ++m)
{
for (uint yy = 0; yy < RTILE; ++yy)
for (uint xx = 0; xx < RTILE; ++xx)
{
if (mask)
X_[yy*RTILE+xx][gy*CTILE+gx] = X.Get(n*RTILE+yy, oy - _Pad.y, ox - _Pad.x, (m*CTILE + gx)*RTILE+xx);
else
X_[yy*RTILE+xx][gy*CTILE+gx] = 0;
K_[yy*RTILE+xx][gy*CTILE+gx] = K.Get(dy, dx, (m*CTILE + gy)*RTILE+yy, k*RTILE+xx);
}
GroupMemoryBarrierWithGroupSync();
for (uint ii = 0; ii < CTILE; ++ii)
{
float x[RTILE][RTILE];
float k[RTILE][RTILE];
[unroll]
for (uint yy = 0; yy < RTILE; ++yy)
{
[unroll]
for (uint xx = 0; xx < RTILE; ++xx)
{
x[yy][xx] = X_[yy*RTILE+xx][gy*CTILE+ii];
k[yy][xx] = K_[yy*RTILE+xx][ii*CTILE+gx];
}
}
[unroll]
for (uint yyy = 0; yyy < RTILE; ++yyy)
{
[unroll]
for (uint xxx = 0; xxx < RTILE; ++xxx)
{
[unroll]
for (uint i = 0; i < RTILE; ++i)
{
v[yyy][xxx] = mad(x[yyy][i], k[i][xxx], v[yyy][xxx]);
}
}
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
for (uint yy = 0; yy < RTILE; ++yy)
for (uint xx = 0; xx < RTILE; ++xx)
O.Set(n*RTILE+yy, y, x, k*RTILE+xx, v[yy][xx]);
#undef X_
#undef K_
}

438
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute


#pragma kernel Dense_L1Cached64
#pragma kernel DenseTiled16x16
#pragma kernel DenseTiled32x32
#pragma kernel DenseTiled64x64
#include "Tensor.cginc"

[numthreads(CACHESIZE, 1, 1)]
void Dense_L1Cached64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ Dense_L1Cached64_X
uint x = CACHESIZE * groupID.x + groupThreadID.x;
uint y = groupID.y;
uint wIndex = W.Index(0, x);
float acc = B.Get(x);
// loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps
for (uint i = 0; i < X.GetFlatWidth(); i += CACHESIZE)
{
// Cache X
// coalescent reads
X_[groupThreadID.x] = X.SafeGet(y, i + groupThreadID.x);
GroupMemoryBarrierWithGroupSync();
// X * W
if (i + CACHESIZE <= X.GetFlatWidth())
{
[unroll]
for (uint di = 0; di < CACHESIZE; ++di)
{
acc = fastfma(X_[di], W.data[wIndex], acc);
wIndex += W.GetFlatWidth();
}
}
else
{
// handle remainder of the line < CACHESIZE
for (uint di = 0; i + di < X.GetFlatWidth(); ++di)
{
acc = fastfma(X_[di], W.data[wIndex], acc);
wIndex += W.GetFlatWidth();
}
}
GroupMemoryBarrierWithGroupSync();
}
// needed all threads to load matrix line, x might be out of the bounds for writing
if (x < O.GetFlatWidth())
O.Set(y, x, acc);
#undef X_
}
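For reference, Dense_L1Cached64 computes a plain fully connected layer, O = X * W + B, with the inner loop walked in CACHESIZE chunks so that one slice of the X row can be staged in groupshared memory. A minimal Python sketch of the same result (assumed shapes: X [rows][inner], W [inner][out], B [out]; names are illustrative):

def dense_reference(X, W, B):
    rows, inner, out = len(X), len(W), len(W[0])
    O = [[B[x] for x in range(out)] for _ in range(rows)]
    for y in range(rows):
        for i in range(inner):      # the shader steps this loop CACHESIZE at a time
            for x in range(out):    # and reuses the cached X[y][i..i+CACHESIZE) across all x
                O[y][x] += X[y][i] * W[i][x]
    return O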

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled16x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache
#define W_ DenseTiled_Wcache
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
bool mask = (x < O.GetFlatWidth() && y < O.GetFlatHeight());
float v = B.Get(x);
for (uint m = 0; m < X.GetFlatWidth()/TILE_WIDTH; ++m)
{
if (mask)
{
X_[ty][tx] = X.Get(y, m*TILE_WIDTH + tx);
W_[ty][tx] = W.Get(m*TILE_WIDTH + ty, x);
}
else
{
X_[ty][tx] = 0;
W_[ty][tx] = 0;
}
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < TILE_WIDTH; ++i)
{
v = fastfma(X_[ty][i], W_[i][tx], v);
}
GroupMemoryBarrierWithGroupSync();
}
if (mask)
O.Set(y, x, v);
#undef X_
#undef W_
}
#undef TILE_WIDTH

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled32x32(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth / 2, O.flatHeight / 2, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache32
#define W_ DenseTiled_Wcache32
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
float b0 = B.Get(x*2+0);
float b1 = B.Get(x*2+1);
float4 v = float4(b0, b1,
b0, b1);
for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*2);)
{
float x0 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+0);
float x1 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+1);
float x2 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+0);
float x3 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+1);
float w0 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+0);
float w1 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+1);
float w2 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+0);
float w3 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+1);
++m;
X_[0][ty][tx] = x0;
X_[1][ty][tx] = x1;
X_[2][ty][tx] = x2;
X_[3][ty][tx] = x3;
W_[0][ty][tx] = w0;
W_[1][ty][tx] = w1;
W_[2][ty][tx] = w2;
W_[3][ty][tx] = w3;
GroupMemoryBarrierWithGroupSync();
[unroll]
for (uint i = 0; i < TILE_WIDTH; ++i)
{
float4 x =
float4( X_[0][ty][i],
X_[1][ty][i],
X_[2][ty][i],
X_[3][ty][i]);
float4 w =
float4( W_[0][i][tx],
W_[1][i][tx],
W_[2][i][tx],
W_[3][i][tx]);
v.x = fastfma(w.x, x.x, v.x);
v.y = fastfma(w.y, x.x, v.y);
v.z = fastfma(w.x, x.z, v.z);
v.w = fastfma(w.y, x.z, v.w);
v.x = fastfma(w.z, x.y, v.x);
v.y = fastfma(w.w, x.y, v.y);
v.z = fastfma(w.z, x.w, v.z);
v.w = fastfma(w.w, x.w, v.w);
}
GroupMemoryBarrierWithGroupSync();
}
O.Set(y*2+0, x*2+0, v.x);
O.Set(y*2+0, x*2+1, v.y);
O.Set(y*2+1, x*2+0, v.z);
O.Set(y*2+1, x*2+1, v.w);
#undef X_
#undef W_
}
#undef TILE_WIDTH

[numthreads(TILE_WIDTH,TILE_WIDTH,1)]
void DenseTiled64x64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
DISPATCH_ARGS(O.flatWidth / 4, O.flatHeight / 4, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_Xcache64
#define W_ DenseTiled_Wcache64
uint tx = groupThreadID.x;
uint ty = groupThreadID.y;
uint x = groupID.x*TILE_WIDTH + tx;
uint y = groupID.y*TILE_WIDTH + ty;
float b0 = B.Get(x*4+0);
float b1 = B.Get(x*4+1);
float b2 = B.Get(x*4+2);
float b3 = B.Get(x*4+3);
float4 v0, v1, v2, v3;
v0 = v1 = v2 = v3 = float4(b0, b1, b2, b3);
for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*4); ++m)
{
for (uint yy = 0; yy < 4; ++yy)
for (uint xx = 0; xx < 4; ++xx)
{
X_[yy*4+xx][ty*TILE_WIDTH+tx] = X.Get(y*4+yy, (m*TILE_WIDTH + tx)*4+xx);
W_[yy*4+xx][ty*TILE_WIDTH+tx] = W.Get((m*TILE_WIDTH + ty)*4+yy, x*4+xx);
}
GroupMemoryBarrierWithGroupSync();
for (uint i = 0; i < TILE_WIDTH; ++i)
{
[unroll]
for (uint q = 0; q < 4; ++q)
{
float x0 = X_[0*4+q][ty*TILE_WIDTH+i];
float x1 = X_[1*4+q][ty*TILE_WIDTH+i];
float x2 = X_[2*4+q][ty*TILE_WIDTH+i];
float x3 = X_[3*4+q][ty*TILE_WIDTH+i];
float w0 = W_[q*4+0][i*TILE_WIDTH+tx];
float w1 = W_[q*4+1][i*TILE_WIDTH+tx];
float w2 = W_[q*4+2][i*TILE_WIDTH+tx];
float w3 = W_[q*4+3][i*TILE_WIDTH+tx];
v0.x = fastfma(x0, w0, v0.x); //--
v1.x = fastfma(x1, w0, v1.x);
v2.x = fastfma(x2, w0, v2.x);
v3.x = fastfma(x3, w0, v3.x);
v0.y = fastfma(x0, w1, v0.y); //--
v1.y = fastfma(x1, w1, v1.y);
v2.y = fastfma(x2, w1, v2.y);
v3.y = fastfma(x3, w1, v3.y);
v0.z = fastfma(x0, w2, v0.z); //--
v1.z = fastfma(x1, w2, v1.z);
v2.z = fastfma(x2, w2, v2.z);
v3.z = fastfma(x3, w2, v3.z);
v0.w = fastfma(x0, w3, v0.w); //--
v1.w = fastfma(x1, w3, v1.w);
v2.w = fastfma(x2, w3, v2.w);
v3.w = fastfma(x3, w3, v3.w);
}
GroupMemoryBarrierWithGroupSync();
}
}
O.Set(y*4+0, x*4+0, v0.x);
O.Set(y*4+0, x*4+1, v0.y);
O.Set(y*4+0, x*4+2, v0.z);
O.Set(y*4+0, x*4+3, v0.w);
O.Set(y*4+1, x*4+0, v1.x);
O.Set(y*4+1, x*4+1, v1.y);
O.Set(y*4+1, x*4+2, v1.z);
O.Set(y*4+1, x*4+3, v1.w);
O.Set(y*4+2, x*4+0, v2.x);
O.Set(y*4+2, x*4+1, v2.y);
O.Set(y*4+2, x*4+2, v2.z);
O.Set(y*4+2, x*4+3, v2.w);
O.Set(y*4+3, x*4+0, v3.x);
O.Set(y*4+3, x*4+1, v3.y);
O.Set(y*4+3, x*4+2, v3.z);
O.Set(y*4+3, x*4+3, v3.w);
#undef X_
#undef W_
}

30
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute


float2 Unpack(SharedTensor t, uint y, uint x)
{
uint v = asuint(t.data[t.Index(y, x) >> 1]);
// TEMPORARY: f16tof32 is broken in GLSL/Metal compiler
// using custom conversion function for now
//return float2(f16tof32(v), f16tof32(v>>16));
return float2(f16tof32_(v), f16tof32_(v>>16));
}
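Unpack above reads one 32-bit word that holds two packed half-precision values and converts them with the custom f16tof32_ helper. A small NumPy-based sketch of the same unpacking (NumPy is only an assumption for illustration; the low 16 bits come first, then the high 16 bits, matching the order above):

import numpy as np

def unpack_two_halves(v):
    # v is a 32-bit word containing two fp16 values, low half first
    raw = np.array([v & 0xFFFF, (v >> 16) & 0xFFFF], dtype=np.uint16)
    return raw.view(np.float16).astype(np.float32)

# e.g. two packed 1.0h values: unpack_two_halves(0x3C003C00) -> [1.0, 1.0]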
// NOTE: usually this path is used for <16 batches

DISPATCH_ARGS(O.flatWidth/2, O.flatHeight, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
float2 acc = Unpack(B, 0, x*2);
for (uint i = 0; i < X.width; ++i)
{
float2 w = Unpack(W, i, x*2);
acc += X.Get(y, i) * w;
}
O.Set(y, x*2+0, acc[0]);
O.Set(y, x*2+1, acc[1]);
}

944
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute
(Diff content is too large to display.)

214
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute


[numthreads(THREAD_COUNT, 1, 1)]
void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// @TODO: DISPATCH_ARGS(...)
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
#define X_ DenseTiled_XcacheR
#define W_ DenseTiled_WcacheR
uint id = groupThreadID.x;
uint bx = groupID.x;
uint by = groupID.y;
uint bbx = id % BLOCK_WIDTH;
uint bby = id / BLOCK_WIDTH;
float v[BLOCK_WIDTH][BLOCK_WIDTH];
for (uint yy = 0; yy < BLOCK_WIDTH; ++yy)
for (uint xx = 0; xx < BLOCK_WIDTH; ++xx)
{
float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx);
v[yy][xx] = bias;
}
for (uint m = 0; m < X.GetFlatWidth()/LOAD_DEPTH; ++m)
{
for (uint q = 0; q < LOAD_DEPTH; ++q)
{
X_[q][id] = X.Get(by*LOAD_WIDTH + id, m*LOAD_DEPTH + q);
W_[q][id] = W.Get(m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id);
}
GroupMemoryBarrierWithGroupSync();
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
[unroll] for (uint i = 0; i < LOAD_DEPTH; ++i)
{
v[yyy][xxx] = mad(X_[i][bby*BLOCK_WIDTH + yyy], W_[i][bbx*BLOCK_WIDTH + xxx], v[yyy][xxx]);
}
GroupMemoryBarrierWithGroupSync();
}
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, v[yyy][xxx]);
#undef X_
#undef W_
}

[numthreads(THREAD_COUNT, 1, 1)]
void Conv2D_Kernel3x3_64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// @TODO: DISPATCH_ARGS(...)
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
#define X_ Conv_XcacheR
#define K_ Conv_KcacheR
uint id = groupThreadID.x;
uint bx = groupID.x;
uint by = groupID.y;
uint bbx = id % BLOCK_WIDTH;
uint bby = id / BLOCK_WIDTH;
uint width = O.width;
uint height = O.height;
// ASSERT(LOAD_WIDTH == THREAD_COUNT)
uint loadNYX = by*LOAD_WIDTH + id; // only works for 8x8
uint loadX = loadNYX % width;
uint loadNY = loadNYX / width;
uint loadY = loadNY % height;
uint loadN = loadNY / height;
// @TODO: validate that _Stride works, added the following 2 lines without testing
loadX *= _Stride.x;
loadY *= _Stride.y;
float v[BLOCK_WIDTH][BLOCK_WIDTH];
[unroll] for (uint yy = 0; yy < BLOCK_WIDTH; ++yy)
[unroll] for (uint xx = 0; xx < BLOCK_WIDTH; ++xx)
{
float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx);
v[yy][xx] = bias;
}
for (uint dy = 0; dy < 3; ++dy)
{
bool mask = true;
if (loadY+dy < _Pad.y) mask = false;
if (loadY+dy - _Pad.w >= X.height) mask = false;
for (uint dx = 0; dx < 3; ++dx)
{
if (loadX+dx < _Pad.x) mask = false;
if (loadX+dx - _Pad.z >= X.width) mask = false;
for (uint m = 0; m < X.channels/LOAD_DEPTH; ++m)
{
for (uint q = 0; q < LOAD_DEPTH; ++q)
{
if (mask)
X_[q][id] = X.Get(loadN, loadY+dy-_Pad.y, loadX+dx-_Pad.x, m*LOAD_DEPTH + q);
else
X_[q][id] = 0;
K_[q][id] = K.Get(dy, dx, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id);
}
GroupMemoryBarrierWithGroupSync();
for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
[unroll] for (uint i = 0; i < LOAD_DEPTH; ++i)
{
v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * K_[i][bbx*BLOCK_WIDTH + xxx];
}
GroupMemoryBarrierWithGroupSync();
}
}
}
[unroll] for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy)
[unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx)
{
uint saveNYX = by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy;
uint saveX = saveNYX % width;
uint saveNY = saveNYX / width;
uint saveY = saveNY % height;
uint saveN = saveNY / height;
uint saveK = bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx;
O.Set(saveN, saveY, saveX, saveK, v[yyy][xxx]);
}
#undef X_
#undef K_
}

484
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute


#pragma kernel ScaleBias
#pragma kernel ScaleBias_CNyx2
#pragma kernel ScaleBias_Flat
#pragma kernel Upsample2D
#pragma kernel AvgPool2D
#pragma kernel MaxPool2D

#pragma kernel InstanceNorm
#pragma kernel Copy
/*
ScaleBias_Flat+ScaleBias_CNyx2 (NEW) vs ScaleBias+ScaleBias_CNyx
Compute Precompiled
MOBILENET@4
<<<Exec #64: 66.5 ms, cpu: 7.7 ms, avg: 66.3 ms, result:OK <--- NEW!
<<<Exec #64: 66.7 ms, cpu: 8.0 ms, avg: 67.1 ms, result:OK
*/
#include "Tensor.cginc"
TENSOR_DECL(X)

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void ScaleBias(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
float bias = B.Get(0, 0, 0, c);
float scale = W.Get(0, 0, 0, c);
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
float bias = B.Get(0, 0, 0, c);
float scale = W.Get(0, 0, 0, c);
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}
NUMTHREADS((256,1,1), (128,1,1), (64,1,1))
void ScaleBias_Flat(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.length, 1, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint i = dispatchThreadID.x;
if (i >= O.GetLength()) return;
uint c = i % X.channels;
float bias = B.Get(c);
float scale = W.Get(c);
float v = X.Get(i);
v = v * scale + bias;
O.Set(i, v);
}
NUMTHREADS((32,4,1), (32,2,1), (16,2,1))
void ScaleBias_CNyx2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
uint i = dispatchThreadID.y * X.channels + c;
if (c >= X.channels) return;
if (i >= X.GetLength()) return;
float bias = B.Get(c);
float scale = W.Get(c);
float v = X.Get(i);
v = v * scale + bias;
O.Set(i, v);
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(X.channels, X.width, X.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= X.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float v = X.Get(n, y, x, c);
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Pool.y + dy;
uint ox = x * _Pool.x + dx;
O.Set(n, oy, ox, c, v);
}
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float maxV = -FLT_MAX;
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width);
float v = (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0;
maxV = max(v, maxV);
}
O.Set(n, y, x, c, maxV);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float acc = 0;
float counter = 0;
for (uint dy = 0; dy < _Pool.y; ++dy)
for (uint dx = 0; dx < _Pool.x; ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width);
acc += (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0;
counter += (mask)? 1: 0;
}
acc /= counter;
O.Set(n, y, x, c, acc);
}
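Note that the padded AvgPool2D above divides by counter, i.e. by the number of taps that actually landed inside X, not by the full pool window. A minimal sketch of that windowed average (single channel, illustrative names only):

def avg_pool_window(X, y0, x0, pool_h, pool_w, H, W):
    # average only over taps inside the unpadded input, as the kernel's mask/counter does
    acc, count = 0.0, 0
    for dy in range(pool_h):
        for dx in range(pool_w):
            iy, ix = y0 + dy, x0 + dx
            if 0 <= iy < H and 0 <= ix < W:
                acc += X[iy][ix]
                count += 1
    return acc / count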
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float maxV = -FLT_MAX;
for (uint dy = 0; dy < _Pool[1]; ++dy)
for (uint dx = 0; dx < _Pool[0]; ++dx)
{
float v = X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c);
maxV = max(v, maxV);
}
O.Set(n, y, x, c, maxV);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
float invPoolSize = 1.0f / (_Pool[0] * _Pool[1]);
for (uint n = 0; n < X.batch; ++n)
{
float v = 0;
for (uint dy = 0; dy < _Pool[1]; ++dy)
for (uint dx = 0; dx < _Pool[0]; ++dx)
v += X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c) * invPoolSize;
O.Set(n, y, x, c, v);
}
}
NUMTHREADS((4,8,8), (4,8,4), (4,4,4))

DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v0 = X.Get(n, y*2, x*2, c);
float v1 = X.Get(n, y*2+1, x*2, c);
float v2 = X.Get(n, y*2, x*2+1, c);
float v3 = X.Get(n, y*2+1, x*2+1, c);
float v = max(v0, max(v1, max(v2, v3)));
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, 1, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
if (c >= O.channels) return;
//ASSERT(X.batch == O.batch)
for (uint n = 0; n < X.batch; ++n)
{
float v = 0;
for (uint y = 0; y < X.height; ++y)
for (uint x = 0; x < X.width; ++x)
v += X.Get(n, y, x, c);
v /= (X.height * X.width);
O.Set(n, 0, 0, c, v);
}
DISPATCH_ARGS(O.channels, 1, 1);
TENSOR_SHARED2_ARGS4(X, W, B, WBK, O);
uint c = dispatchThreadID.x;
if (c >= O.channels) return;
//ASSERT(X.shape == O.shape)
float gamma = W.Get(0, 0, 0, c);
float beta = B.Get(0, 0, 0, c);
for (uint n = 0; n < O.batch; ++n)
{
uint x, y;
// calc mean
float acc = 0;
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
acc += X.Get(n, y, x, c);
float mean = acc / (O.width * O.height);
// calc variance
acc = 0;
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
{
float delta = X.Get(n, y, x, c) - mean;
acc += delta * delta;
}
float var = acc / (O.width * O.height);
// normalization factor
float invNormFactor = 1 / sqrt(var + FLT_EPSILON);
float scale = gamma * invNormFactor;
float bias = beta - gamma * mean * invNormFactor;
// apply normalization
for (y = 0; y < O.height; ++y)
for (x = 0; x < O.width; ++x)
{
float v = X.Get(n, y, x, c);
v = v * scale + bias;
O.Set(n, y, x, c, v);
}
}
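The InstanceNorm kernel folds the per-channel statistics into a single scale and bias before the final pass: y = gamma * (x - mean) / sqrt(var + eps) + beta is rewritten as y = x * scale + bias with scale = gamma / sqrt(var + eps) and bias = beta - mean * scale. A short reference sketch over one (n, c) slice (eps stands in for FLT_EPSILON):

def instance_norm_channel(x, gamma, beta, eps=1e-7):
    mean = sum(x) / len(x)
    var = sum((v - mean) ** 2 for v in x) / len(x)
    inv = 1.0 / (var + eps) ** 0.5
    scale = gamma * inv                  # same folding as the shader
    bias = beta - gamma * mean * inv
    return [v * scale + bias for v in x]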
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(X.channels, X.width, X.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= X.channels) return; if (x >= X.width) return; if (y >= X.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
O.Set(n + _Pad[0], y + _Pad[1], x + _Pad[2], c + _Pad[3], v);
}
}

44
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc


// Copyright: Copyleft 2012 :-)
float RandomUsingCos(float4 seed)
{
float4 K1 = float4( // Transcendental numbers:
0.64341054629, // (Cahen's constant)
23.14069263277926, // e^pi (Gelfond's constant)
2.665144142690225, // 2^sqrt(2) (Gelfond-Schneider constant)
3.14159265359 // pi
);
return frac(cos(dot(seed, K1)) * 12345.6789);
}
// Based on: https://stackoverflow.com/questions/4200224/random-noise-functions-for-glsl

// A single iteration of Bob Jenkins' One-At-A-Time hashing algorithm.
uint hash(uint x)
{
x += ( x << 10u );
x ^= ( x >> 6u );
x += ( x << 3u );
x ^= ( x >> 11u );
x += ( x << 15u );
return x;
}
uint hash( uint2 v ) { return hash( v.x ^ hash(v.y) ); }
uint hash( uint3 v ) { return hash( v.x ^ hash(v.y) ^ hash(v.z) ); }

// All zeroes yields 0.0, all ones yields the next smallest representable value below 1.0.
float floatConstruct(uint m)
{
const uint ieeeMantissa = 0x007FFFFFu; // binary32 mantissa bitmask
const uint ieeeOne = 0x3F800000u; // 1.0 in IEEE binary32
m &= ieeeMantissa; // Keep only mantissa bits (fractional part)
m |= ieeeOne; // Add fractional part to 1.0
float f = asfloat(m); // Range [1:2]
return f - 1.0; // Range [0:1]
return floatConstruct(hash(asuint(seed)));
}

float Random(float4 seed)
{
return RandomUsingCos(seed);
return Random(seed) <= p ? 1: 0;
}
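The hash-based path above turns a seed into a uniform float in [0, 1): hash the bits, keep the 23 mantissa bits, OR in the exponent bits of 1.0f to get a float in [1, 2), then subtract 1. A C# sketch of the same construction (an assumed port for illustration, not the shipped code):

// One iteration of Bob Jenkins' One-At-A-Time hash, as in Random.cginc.
static uint Hash(uint x)
{
    unchecked
    {
        x += x << 10; x ^= x >> 6; x += x << 3; x ^= x >> 11; x += x << 15;
    }
    return x;
}

// Build a float in [0, 1): keep mantissa bits, set the bit pattern of 1.0f,
// reinterpret as float (range [1, 2)), then shift down by 1.
static float FloatConstruct(uint m)
{
    m &= 0x007FFFFFu;  // binary32 mantissa bitmask
    m |= 0x3F800000u;  // bit pattern of 1.0f
    float f = System.BitConverter.ToSingle(System.BitConverter.GetBytes(m), 0);
    return f - 1.0f;
}

static float Random01(uint seed) { return FloatConstruct(Hash(seed)); }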

480
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc


struct Tensor
{
// @TODO: actually uint seems not like a good idea anymore, consider going to int
uint batch, height, width, channels;
void Init(uint4 nhwc)
{
batch = nhwc.x;
height = nhwc.y;
width = nhwc.z;
channels = nhwc.w;
}
uint4 Dims()
{
return uint4(batch, height, width, channels);
}
uint GetFlatHeight()
{
return batch;
}
uint GetFlatWidth()
{
return height * width * channels;
}
uint GetKernelHeight()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelHeight = batch;
return kernelHeight;
}
uint GetKernelWidth()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelWidth = height;
return kernelWidth;
}
uint GetKernelDepth()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelDepth = width;
return kernelDepth;
}
uint GetKernelCount()
{
// kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count}
uint kernelCount = channels;
return kernelCount;
}
uint GetLength()
{
return batch * height * width * channels;
}
uint Index(uint b, uint h, uint w, uint ch)
{
uint index =
b * height * width * channels +
h * width * channels +
w * channels +
ch;
return index;
}
uint Index(uint b, uint i)
{
uint index =
b * height * width * channels +
i;
return index;
}
StructuredBuffer<float> data;
void Init(uint4 nhwc, StructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch)];
}
float Get(uint b, uint2 pos, uint ch)
{
return data[Index(b, pos.y, pos.x, ch)];
}
float Get(uint b, uint i)
{
return data[Index(b,i)];
}
float Get(uint i)
{
return data[i];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return data[Index(b, pos.y, pos.x, ch)];
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
RWStructuredBuffer<float> data;
void Init(int4 nhwc, RWStructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch)];
}
float Get(uint b, uint2 pos, uint ch)
{
return data[Index(b, pos.y, pos.x, ch)];
}
float Get(uint b, uint i)
{
return data[Index(b,i)];
}
float Get(uint i)
{
return data[i];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return Get(b, pos.y, pos.x, ch);
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
void Set(uint b, uint h, uint w, uint ch, float v)
{
data[Index(b,h,w,ch)] = v;
}
void Set(uint y, uint x, float v)
{
data[Index(y,x)] = v;
}
void Set(uint i, float v)
{
data[i] = v;
}
StructuredBuffer<float> data;
uint offset;
void Init(uint4 nhwc, uint4 info, StructuredBuffer<float> data_)
{
Tensor::Init(nhwc);
data = data_;
offset = info.x;
}
float Get(uint b, uint h, uint w, uint ch)
{
return data[Index(b,h,w,ch) + offset];
}
float Get(uint b, uint2 pos, uint ch)
{
return Get(b, pos.y, pos.x, ch);
}
float Get(uint b, uint i)
{
return data[Index(b,i) + offset];
}
float Get(uint i)
{
return data[i + offset];
}
float BroadcastGet(uint b, uint h, uint w, uint ch)
{
return Get(b % batch, h % height, w % width, ch % channels);
}
float BroadcastGet(uint b, uint2 pos, uint ch)
{
return BroadcastGet(b, pos.y, pos.x, ch);
}
float BroadcastGet(uint b, uint i)
{
return Get(b % GetFlatHeight(), i % GetFlatWidth());
}
float SafeGet(uint b, uint2 pos, uint ch, uint2 pad)
{
if (b >= batch || ch >= channels) return 0;
if (any(pos < pad)) return 0;
if (any(pos >= uint2(width, height) + pad)) return 0;
pos -= pad;
return Get(b, pos, ch);
}
float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad)
{
return SafeGet(b, uint2(w, h), ch, pad);
}
float SafeGet(uint b, uint i)
{
if (b >= batch || i >= height * width * channels) return 0;
return Get(b,i);
}
float SafeGet(uint i)
{
if (i >= batch * height * width * channels) return 0;
return Get(i);
}
};
#define TENSOR_DECL(X) uint4 X##decl[2]; StructuredBuffer<float> X##data;

float fastfma(float a, float b, float c)
{
return dot(float2(a,c), float2(b, 1));
}
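Two small utilities above are worth spelling out: Index() flattens an NHWC coordinate into the linear buffer offset, and fastfma() writes a*b + c as a 2-component dot product to coax the shader compiler into a fused multiply-add. A C# sketch of both (illustrative only):

// NHWC flattening used by Tensor.Index: ((n*H + h)*W + w)*C + c.
static int NhwcIndex(int n, int h, int w, int c, int H, int W, int C)
{
    return ((n * H + h) * W + w) * C + c;
}

// fastfma computes a*b + c; the HLSL version phrases it as dot(float2(a, c), float2(b, 1))
// so the compiler is more likely to emit a single fused multiply-add.
static float FastFma(float a, float b, float c)
{
    return a * b + c;
}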

112
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute


struct TextureAsTensor : Tensor
{
Texture2D<float4> tex;
SamplerState smp;
Texture2DArray<float4> texArray;
SamplerState smpArray;
void Init(uint4 nhwc, Texture2D<float4> tex_, SamplerState sampler_, Texture2DArray<float4> texArray_, SamplerState samplerArray_)
{
Tensor::Init(nhwc);
tex = tex_;
smp = sampler_;
texArray = texArray_;
smpArray = samplerArray_;
}
float4 Get(uint b, uint y, uint x)
{
float3 loc = float3((float)x / (float)width, (float)y / (float)height, b);
if (batch > 1)
return texArray.SampleLevel(smpArray, loc, 0);
else
return tex.SampleLevel(smp, loc.xy, 0);
}
};
#define TENSOR_SHARED2_ARGS3(A, B, S, O) TENSOR_SHARED_ARG(A, S); TENSOR_SHARED_ARG(B, S); TENSOR_ARG_RW(O);

{
// @TODO: currently it fails to compile, needs to be investigated
#if 0
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TextureAsTensor X; X.Init(Xdecl[0], Xtex2D, samplerXtex2D, Xtex2DArray, samplerXtex2DArray);
TENSOR_SHARED_ARG(K, WBK);
TENSOR_SHARED_ARG(B, WBK);
TENSOR_ARG_RW(O);
// ASSERT(X.channels <= MAX_CHANNELS)
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
{
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint oy = y * _Stride.y + dy;
uint ox = x * _Stride.x + dx;
// @TODO: investigate
// WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
if (oy < _Pad.y) continue;
if (oy - _Pad.w >= X.height) continue;
if (ox < _Pad.x) continue;
if (ox - _Pad.z >= X.width) continue;
float4 in4channels = X.Get(n, oy - _Pad.y, ox - _Pad.x);
for (uint c = 0; c < X.channels && c < MAX_CHANNELS; ++c)
{
acc += in4channels[c] * K.Get(dy, dx, c, k);
}
}
}
O.Set(n, y, x, k, acc);
}
#endif
}

57
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md


# Release notes
## 0.2.2
- Added support for the --print-supported-ops flag in the model converters; it prints an approximate list of supported operations. The exact list depends on the converter.
- Added the Keras converter as part of the distribution.
- Compute shaders are now loaded only if a GPU worker is requested.
- Fixed a bug in MaxPool and AvgPool padding. The issue was discovered with the Yolo faces network.
- Fixed a bug in transpose convolution support in the C# backend.
- Fixed TF model conversion with two LSTM cells.
- Fixed a case where a strided-slice end overflows to zero, producing a negative range.
## 0.2.1
- TF importer: fixed ResizeNearestNeighbor aka Upsample2D scaling factor detection.
- TF importer: optimized node sorting. Should be faster than 0.2.0.
- TF importer: made detection of the actual output node from the LSTM/GRU pattern more robust by skipping Const nodes.
- TF importer: improved InstanceNormalization handling.
- TF importer: fixed SquareDifference pattern.
- TF importer: fixed Conv2DBackpropInput (transpose convolution) import.
- Fixed Conv2D performance regression on some GPUs.
- Fixed TextureAsTensorData.Download() to work properly with InterpretDepthAs.Channels.
- Fixed a bug where identity/nop layers would reuse their input as an output, later causing premature release of that tensor during intermediate data cleanup.
- Added scale + bias to the TensorToRenderTexture interface, useful for adjusting network output scale + bias on the fly.
- Fixed double Dispose issue when worker gets garbage collected.
## 0.2.0
- Version bumped to 0.2.0 as it brings breaking API changes; for details see below.
- Significantly reduced temporary memory allocations by introducing internal allocator support. Memory is now re-used between layer executions as much as possible.
- Improved small-workload performance on the CSharp backend.
- Added a parallel implementation for multiple activation functions on the CSharp backend.
- Added a `Peek()` function to `IWorker`; it retains object storage in the worker's allocator, which is useful for quickly grabbing output. If you want to preserve the content of an output tensor between `Execute()` invocations, use `Fetch()` instead (see the usage sketch after these release notes).
- Fixed ESRGAN model conversion (ONNX importer).
- Fixed Tensor <-> Texture copy for textures/tensors whose dimensions are not a multiple of 8.
- Added `Summary()` method to `Worker`. Currently returns allocator information.
- Tabs to spaces! Aiming at higher salary (https://stackoverflow.blog/2017/06/15/developers-use-spaces-make-money-use-tabs/).
- Renamed worker type enum members: `CSharp` -> `CSharpRef`, `CSharpFast` -> `CSharp`, `Compute` -> `ComputeRef`, `ComputeFast` -> `Compute`.
- Implemented new optimized `ComputePrecompiled` worker. This worker caches Compute kernels and state beforehand to reduce CPU overhead.
- Added `ExecuteAsync()` to the `IWorker` interface; it returns an `IEnumerator`, which lets you control how many layers are scheduled per frame (one iteration == one layer).
- Added `Log` op support on Compute workers.
- Optimized activation functions and ScaleBias by accessing the tensor as a contiguous array. Gained ~2.0ms on a 4-batch MobileNet (MBP2016).
- Introduced _Loop version of activations to fight 65535 scheduling limit on D3D11.
- Added .nn as Barracuda model file extension for use in Unity Editor. Also added simple editor importer. Now you can declare serializable fields as NNModel to bind them to .nn asset. ModelLoader.Load() now accepts NNModel as a source.
- Compute: Reduce reference GPU implementation.
- TF importer: Expanded Mean support to mean over channels, implemented Pad (as Border2D), implemented SquaredDifference, added InstanceNormalization and LeakyRelu patterns, StridedSlice implementation.
- TF importer: sort model nodes by dependencies before processing.
- Fixed ComputeBuffer leak when using Compute and ComputePrecompiled backends.
- Made to use Conv2D_L1Cached64_RegisterBlock4x4 more often: improves perf ~2x on Vega 16, and ~30% on Nvidia and Intel.
## 0.1.6
- Added activation type print in verbose mode
- Added fast and parallel CPU implementation for Swish, Relu, Add, Sub, Div, Min, Max, Tanh, Exp

- Compatibility with ML Agents models: 3DBall, PushBlock, GridWorld, Soccer.
## 0.1.0
- First internal build. Due to some bugs encountered, it wasn't published.
# Contributors
- Renaldas (ReJ) Zioma
- Mantas Puida
- Vladimir Oster
- Martin Sternevald
- Valdemar Bučilko
- Kuba Cupisz
- Povilas Kanapickas
- Paulius Puodžiūnas
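Below is a minimal C# sketch of the `Peek()` vs `Fetch()` distinction described in the 0.2.0 notes, assuming the 0.2.x API names that appear in the LearningBrain.cs diff further down (ModelLoader.Load, BarracudaWorkerFactory.CreateWorker, Execute, Peek, Fetch); the output name "action" is a placeholder.

using Barracuda;

public static class WorkerUsageSketch
{
    public static void Run(NNModel asset)
    {
        // Per the 0.2.0 notes, ModelLoader.Load() accepts an NNModel asset as a source.
        var model  = ModelLoader.Load(asset);
        var worker = BarracudaWorkerFactory.CreateWorker(
            BarracudaWorkerFactory.Type.CSharp, model, false);

        worker.Execute(); // input binding omitted in this sketch

        // Peek(): storage stays in the worker's allocator; cheap, but the contents
        // may be recycled by the next Execute().
        var peeked = worker.Peek("action");

        // Fetch(): takes ownership, so the tensor survives later Execute() calls;
        // the caller is responsible for disposing it.
        var fetched = worker.Fetch("action");
        fetched.Dispose();
    }
}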

2
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json


{
"name": "com.unity.barracuda",
"displayName": "Barracuda",
"version": "0.1.6-preview",
"version": "0.2.2-preview",
"unity": "2017.4",
"description": "Barracuda is lightweight and cross-platform Neural Net inference library. Barracuda supports inference both on GPU and CPU.",
"dependencies": {}

5
UnitySDK/Assets/ML-Agents/Scripts/Agent.cs


action.memories.AddRange(memories);
}
public List<float> GetMemoriesAction()
{
return action.memories;
}
/// <summary>
/// Updates the text action.
/// </summary>

31
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs


using System.Collections.Generic;
using System.Linq;
using MLAgents.InferenceBrain.Utils;
using UnityEngine;

public class BarracudaMemoryOutputApplier : TensorApplier.Applier
{
private bool firstHalf = true;
private int memoriesCount;
private int memoryIndex;
public BarracudaMemoryOutputApplier(bool firstHalf)
public BarracudaMemoryOutputApplier(int memoriesCount, int memoryIndex)
this.firstHalf = firstHalf;
this.memoriesCount = memoriesCount;
this.memoryIndex = memoryIndex;
}
public void Apply(Tensor tensor, Dictionary<Agent, AgentInfo> agentInfo)

var memorySize = tensor.Shape[tensor.Shape.Length - 1];
var memorySize = (int)tensor.Shape[tensor.Shape.Length - 1];
var memory = new List<float>();
for (var j = 0; j < memorySize; j++)
{
memory.Add(tensorDataMemory[agentIndex, j]);
}
var memory = agent.GetMemoriesAction();
if (firstHalf)
if (memory == null || memory.Count < memorySize * memoriesCount)
agent.UpdateMemoriesAction(memory);
memory = new List<float>();
memory.AddRange(Enumerable.Repeat(0f, memorySize * memoriesCount));
else
for (var j = 0; j < memorySize; j++)
agent.AppendMemoriesAction(memory);
memory[memorySize * memoryIndex + j] = tensorDataMemory[agentIndex, j];
agent.UpdateMemoriesAction(memory);
agentIndex++;
}
}

23
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs


var memory = GetIntScalar(TensorNames.MemorySize);
if (memory > 0)
{
names.Add(TensorNames.RecurrentOutput_C);
names.Add(TensorNames.RecurrentOutput_H);
foreach (var mem in _model.memories)
{
names.Add(mem.output);
}
}
names.Sort();

// If the model has a non-negative memory size but requires a recurrent input
if (memory > 0)
{
if (!tensorsNames.Contains(TensorNames.RecurrentInPlaceholder_H) ||
!tensorsNames.Contains(TensorNames.RecurrentInPlaceholder_C))
if (!tensorsNames.Any(x => x.EndsWith("_h")) ||
!tensorsNames.Any(x => x.EndsWith("_c")))
{
_failedModelChecks.Add(
"The model does not contain a Recurrent Input Node but has memory_size.");

{
var memOutputs = _model.memories.Select(x => x.output).ToList();
if (!memOutputs.Contains(TensorNames.RecurrentOutput_H) ||
!memOutputs.Contains(TensorNames.RecurrentOutput_C))
if (!memOutputs.Any(x => x.EndsWith("_h")) ||
!memOutputs.Any(x => x.EndsWith("_c")))
{
_failedModelChecks.Add(
"The model does not contain a Recurrent Output Node but has memory_size.");

{TensorNames.RandomNormalEpsilonPlaceholder, ((tensor) => null)},
{TensorNames.ActionMaskPlaceholder, ((tensor) => null)},
{TensorNames.SequenceLengthPlaceholder, ((tensor) => null)},
{TensorNames.RecurrentInPlaceholder_H, ((tensor) => null)},
{TensorNames.RecurrentInPlaceholder_C, ((tensor) => null)},
{TensorNames.RecurrentInPlaceholder, ((tensor) => null)},
foreach (var mem in _model.memories)
tensorTester[mem.input] = ((tensor) => null);
for (var obsIndex = 0; obsIndex < _brainParameters.cameraResolutions.Length; obsIndex++)
{
var index = obsIndex;

var elementType = src.GetType().GetElementType();
var elementSize = Marshal.SizeOf(elementType);
var dest = Array.CreateInstance(elementType, shape);
Buffer.BlockCopy(src, 0, dest, 0, src.Length * elementSize);
Buffer.BlockCopy(src, 0, dest, 0, dest.Length * elementSize);
return dest;
}

22
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs


using System.Collections.Generic;
using System;
using System.Linq;
using Barracuda;
using MLAgents.InferenceBrain.Utils;
namespace MLAgents.InferenceBrain

public class BarracudaRecurrentInputGenerator : TensorGenerator.Generator
{
private bool firstHalf = true;
private int memoriesCount;
private int memoryIndex;
public BarracudaRecurrentInputGenerator(bool firstHalf)
public BarracudaRecurrentInputGenerator(int memoriesCount, int memoryIndex)
this.firstHalf = firstHalf;
this.memoriesCount = memoriesCount;
this.memoryIndex = memoryIndex;
var memorySize = tensor.Shape[tensor.Shape.Length - 1];
var memorySize = (int)tensor.Shape[tensor.Shape.Length - 1];
var memory = agentInfo[agent].memories;
int offset = 0;
if (!firstHalf)
{
offset = memory.Count - (int)memorySize;
}
int offset = memorySize * memoryIndex;
if (memory == null)
{

var textures = agentInfo.Keys.Select(
agent => agentInfo[agent].visualObservations[_index]).ToList();
tensor.Data = Utilities.TextureToFloatArray(textures, _grayScale);
tensor.Shape[0] = textures.Count;
}
}
}
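The generator/applier changes above replace the old firstHalf flag with an explicit (memoriesCount, memoryIndex) pair: an agent's memories are kept as one flat list of length memorySize * memoriesCount, and the tensor for memory i reads or writes the slice starting at memorySize * memoryIndex. A short C# sketch of that slicing (names are illustrative, not the ML-Agents API):

using System.Collections.Generic;

static class MemorySliceSketch
{
    // Copy memory segment `memoryIndex` out of a flat per-agent memory list,
    // padding with zeros when the stored list is missing or too short,
    // matching what BarracudaRecurrentInputGenerator does above.
    public static float[] GetSlice(List<float> agentMemories, int memorySize, int memoryIndex)
    {
        var slice = new float[memorySize];
        int offset = memorySize * memoryIndex;
        for (int j = 0; j < memorySize; ++j)
        {
            if (agentMemories != null && offset + j < agentMemories.Count)
                slice[j] = agentMemories[offset + j];
        }
        return slice;
    }
}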

15
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs


using System.Collections.Generic;
#define ENABLE_BARRACUDA
using System.Collections.Generic;
namespace MLAgents.InferenceBrain
{

/// <param name="bp"> The BrainParameters used to determine what Appliers will be
/// used</param>
/// <param name="seed"> The seed the Appliers will be initialized with.</param>
public TensorApplier(BrainParameters bp, int seed)
public TensorApplier(BrainParameters bp, int seed, object barracudaModel = null)
{
_dict[TensorNames.ValueEstimateOutput] = new ValueEstimateApplier();
if (bp.vectorActionSpaceType == SpaceType.continuous)

}
_dict[TensorNames.RecurrentOutput] = new MemoryOutputApplier();
_dict[TensorNames.RecurrentOutput_C] = new BarracudaMemoryOutputApplier(true);
_dict[TensorNames.RecurrentOutput_H] = new BarracudaMemoryOutputApplier(false);
#if ENABLE_BARRACUDA
Barracuda.Model model = (Barracuda.Model) barracudaModel;
for (var i = 0; i < model?.memories.Length; i++)
{
_dict[model.memories[i].output] = new BarracudaMemoryOutputApplier(model.memories.Length, i);
}
#endif
}
/// <summary>

15
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs


using System.Collections.Generic;
#define ENABLE_BARRACUDA
using System.Collections.Generic;
using Barracuda;
namespace MLAgents.InferenceBrain
{

/// <param name="bp"> The BrainParameters used to determine what Generators will be
/// used</param>
/// <param name="seed"> The seed the Generators will be initialized with.</param>
public TensorGenerator(BrainParameters bp, int seed)
public TensorGenerator(BrainParameters bp, int seed, object barracudaModel = null)
{
// Generator for Inputs
_dict[TensorNames.BatchSizePlaceholder] = new BatchSizeGenerator();

_dict[TensorNames.RecurrentInPlaceholder_C] = new BarracudaRecurrentInputGenerator(true);
_dict[TensorNames.RecurrentInPlaceholder_H] = new BarracudaRecurrentInputGenerator(false);
#if ENABLE_BARRACUDA
Barracuda.Model model = (Barracuda.Model) barracudaModel;
for (var i = 0; i < model?.memories.Length; i++)
{
_dict[model.memories[i].input] = new BarracudaRecurrentInputGenerator(model.memories.Length, i);
}
#endif
_dict[TensorNames.PreviousActionPlaceholder] = new PreviousActionInputGenerator();
_dict[TensorNames.ActionMaskPlaceholder] = new ActionMaskInputGenerator();

11
UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs


_barracudaModel = ModelLoader.Load(model.Value);
var executionDevice = inferenceDevice == InferenceDevice.GPU
? BarracudaWorkerFactory.Type.ComputeFast
: BarracudaWorkerFactory.Type.CSharpFast;
? BarracudaWorkerFactory.Type.ComputePrecompiled
: BarracudaWorkerFactory.Type.CSharp;
_engine = BarracudaWorkerFactory.CreateWorker(executionDevice, _barracudaModel, _verbose);
}

_modelParamLoader = BarracudaModelParamLoader.GetLoaderAndCheck(_engine, _barracudaModel, brainParameters);
_inferenceInputs = _modelParamLoader.GetInputTensors();
_outputNames = _modelParamLoader.GetOutputNames();
_tensorGenerator = new TensorGenerator(brainParameters, seed);
_tensorApplier = new TensorApplier(brainParameters, seed);
_tensorGenerator = new TensorGenerator(brainParameters, seed, _barracudaModel);
_tensorApplier = new TensorApplier(brainParameters, seed, _barracudaModel);
#endif
}

var outputs = new List<Tensor>();
foreach (var name in names)
{
var outp = _engine.Fetch(name);
var outp = _engine.Peek(name);
outp.Dispose();
}
return outputs;

370
ml-agents/mlagents/trainers/barracuda.py


from collections import defaultdict
import numpy as np
import json
import struct # convert from Python values and C structs
import re
import argparse
import os.path

self.globals = []
self.memories = []
def __init__(self, **entries):
self.__dict__.update(entries)
def __init__(self, **entries): self.__dict__.update(entries)
parser.add_argument("source_file", help=help)
parser.add_argument("target_file", help="output Barracuda binary file")
parser.add_argument("-trim", "--trim-unused-by-output")
parser.add_argument("--print-layers", action="store_true")
parser.add_argument("--print-source-json", action="store_true")
parser.add_argument("-json", "--print-barracuda-json", action="store_true")
parser.add_argument("--print-layer-links", action="store_true")
parser.add_argument("--print-patterns", action="store_true")
parser.add_argument("--print-tensors", action="store_true")
parser.add_argument("--verbose", action="store_true")
parser.add_argument('source_file', help=help)
parser.add_argument('target_file', help='output Barracuda binary file')
parser.add_argument('-trim', '--trim-unused-by-output')
parser.add_argument('--print-layers', action='store_true')
parser.add_argument('--print-source-json', action='store_true')
parser.add_argument('-json', '--print-barracuda-json', action='store_true')
parser.add_argument('--print-layer-links', action='store_true')
parser.add_argument('--print-patterns', action='store_true')
parser.add_argument('--print-tensors', action='store_true')
parser.add_argument('--print-supported-ops', action='store_true')
parser.add_argument('--verbose', action='store_true')
args.compress_f16 = (
False
) # TEMP: disabled, until properly implemented parser.add_argument('-f16', '--compress-f16', action='store_true')
output_extension = ".bc" if not args.compress_f16 else ".f16.bc"
args.compress_f16 = False # TEMP: disabled, until properly implemented parser.add_argument('-f16', '--compress-f16', action='store_true')
output_extension = '.bc' if not args.compress_f16 else '.f16.bc'
print("File", args.source_file, "does not exist.")
print('File', args.source_file, 'does not exist.')
return os.path.splitext(os.path.basename(filename))[0] + newExtenstion
return os.path.splitext(os.path.basename(filename))[0] + newExtenstion;
args.target_file = os.path.join(
args.target_file,
replaceFilenameExtension(args.source_file, output_extension),
)
args.target_file = os.path.join(args.target_file, replaceFilenameExtension(args.source_file, output_extension))
# Fuse training time BatchNorm tensors into Scale & Bias
def fuse_batchnorm_weights(gamma, beta, mean, var, epsilon):

bias = beta - gamma * mean / np.sqrt(var + epsilon)
return [scale, bias]
if hasattr(model, "layers"):
if hasattr(model, 'layers'):
model = model.layers
inputs_and_memories = set(list(inputs) + list(memories[1::3]))

ready.add(l.name)
return missing
# Class to represent a graph
class Graph:
def __init__(self, vertices):
self.graph = defaultdict(list) # dictionary containing adjacency List
self.V = vertices # No. of vertices
# function to add an edge to graph
def addEdge(self, u, v):
self.graph[u].append(v)
# A recursive function used by topologicalSort
def topologicalSortUtil(self, v, visited, stack):
# Mark the current node as visited.
class Graph:
def __init__(self,vertices):
self.graph = defaultdict(list) #dictionary containing adjacency List
self.V = vertices #No. of vertices
# function to add an edge to graph
def addEdge(self,u,v):
self.graph[u].append(v)
# A recursive function used by topologicalSort
def topologicalSortUtil(self,v,visited,stack):
# Mark the current node as visited.
# Recur for all the vertices adjacent to this vertex
for i in self.graph[v]:
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
# Push current vertex to stack which stores result
stack.insert(0,v)
# Recur for all the vertices adjacent to this vertex
for i in self.graph[v]:
if visited[i] == False:
self.topologicalSortUtil(i, visited, stack)
# Push current vertex to stack which stores result
stack.insert(0, v)
# The function to do Topological Sort. It uses recursive
# topologicalSortUtil()
def topologicalSort(self):
# Mark all the vertices as not visited
visited = [False] * self.V
stack = []
# Call the recursive helper function to store Topological
# Sort starting from all vertices one by one
for i in range(self.V):
if visited[i] == False:
self.topologicalSortUtil(i, visited, stack)
# print(stack)
# The function to do Topological Sort. It uses recursive
# topologicalSortUtil()
def topologicalSort(self):
# Mark all the vertices as not visited
visited = [False]*self.V
stack =[]
# Call the recursive helper function to store Topological
# Sort starting from all vertices one by one
for i in range(self.V):
if visited[i] == False:
self.topologicalSortUtil(i,visited,stack)
#print(stack)
if len(find_missing_inputs(model, inputs_and_memories)) == 0:
if (len(find_missing_inputs(model, inputs_and_memories)) == 0):
return model
g = Graph(len(model))

for l in model:
layers[l.name] = id
layers[l.name] = id;
id += 1
for layer in model:

print("SORTED:", sorted_layer_indices)
new_model = [model[idx] for idx in sorted_layer_indices]
assert len(find_missing_inputs(new_model, inputs_and_memories)) == 0
assert(len(find_missing_inputs(new_model, inputs_and_memories)) == 0)
if hasattr(model, "layers"):
if hasattr(model, 'layers'):
def flatten(items, enter=lambda x: isinstance(x, list)):
def flatten(items,enter=lambda x:isinstance(x, list)):
# http://stackoverflow.com/a/40857703
# https://github.com/ctmakro/canton/blob/master/canton/misc.py
"""Yield items from any nested iterable; see REF."""

yield x
def trim_model(model, outputs):
layers = {l.name: l for l in model}
layers = {l.name:l for l in model}
connected = {o for o in outputs}
while len(outputs) > 0:
outputs = set(flatten([layers[o].inputs for o in outputs if o in layers]))

connected.add(o)
trimmed = [l.name for l in model if l.name not in connected]
return str(arr)[1:-1] # array to string without brackets
print("TRIMMED:", array_without_brackets(trimmed))
return [l for l in model if l.name in connected]

print("Trimming model given outputs to preserve:", preserve_outputs)
model = trim_model(model, preserve_outputs)
else:
print(
"WARNING: Trim couldn't find any layers to match:", criteria_regexp_string
)
print("WARNING: Trim couldn't find any layers to match:", criteria_regexp_string)
# Fuse
def fuse(model, verbose):
i = 0
while i < len(model) - 1:
if model[i].type == model[i+1].type and model[i].type == 255: # Load
model[i].tensors += model[i+1].tensors
del model[i+1]
else:
i += 1
return model
compress_classes = {"Dense"}
compress_classes = {
'Dense'
}
if l.class_name in compress_classes:
print(
"Compressing %s layer '%s' weights to float16" % (l.class_name, l.name)
)
if (l.class_name in compress_classes):
print("Compressing %s layer '%s' weights to float16" % (l.class_name, l.name))
if isinstance(o, np.ndarray): # skip binary data packed inside ndarray
if getattr(o, "__dict__", None):
if getattr(o, '__dict__', None):
s = json.dumps(model.layers, cls=StructEncoder, separators=(", ", ":"))
s = json.dumps(model.layers, cls=StructEncoder, separators=(', ',':'))
s = s.replace("]}, {", "]},\n{")
s = s.replace(":[{", ":[\n\t{")
s = s.replace("}, {", "},\n\t{")
s = s.replace(']}, {', ']},\n{')
s = s.replace(':[{', ':[\n\t{')
s = s.replace('}, {', '},\n\t{')
return str(arr)[1:-1] # array to string without brackets
if print_layer_links:
for l in model.layers:

if model.globals:
if isinstance(model.globals, dict):
model.globals = {x.name: x.shape for x in model.globals}
model.globals = {x.name:x.shape for x in model.globals}
ins = {i: model.inputs[i] for i in l.inputs if i in model.inputs}
ins = {i:model.inputs[i] for i in l.inputs if i in model.inputs}
else:
ins = [i for i in l.inputs if i in model.inputs]
if ins:

print("OUT:", array_without_brackets(model.outputs))
if print_tensors:
if (print_tensors):
def __init__(self, scope=""):
def __init__(self, scope=''):
if attr == "_":
if attr == '_':
return self.layers[-1].name if len(self.layer) > 0 else self.scope
raise AttributeError(attr)

i = 1
while name in self.names_taken:
name = self.layers[-1].op + "_" + str(i)
name = self.layers[-1].op + '_' + str(i)
self.layers[-1].name = self.scope + ("/" if self.scope else "") + name
self.layers[-1].name = self.scope + ('/' if self.scope else '') + name
def concat(self, a, b, out=""):
self.layers += [Struct(name=out, op="Concat", input=[a, b])]
def concat(self, a, b, axis=-1, out=''):
self.layers += [Struct(name=out, op='Concat', axis=axis, input=[a, b])]
def mad(self, x, kernel, bias, out=""):
self.layers += [Struct(name=out, op="Dense", input=[x, kernel, bias])]
def mad(self, x, kernel, bias, out=''):
self.layers += [Struct(name=out, op='Dense', input=[x, kernel, bias])]
return self._patch_last_layer_name_and_return()
def mul(self, a, b, out=''):
self.layers += [Struct(name=out, op='Mul', input=[a, b])]
return self._patch_last_layer_name_and_return()
def add(self, a, b, out=''):
self.layers += [Struct(name=out, op='Add', input=[a, b])]
return self._patch_last_layer_name_and_return()
def sub(self, a, b, out=''):
self.layers += [Struct(name=out, op='Sub', input=[a, b])]
def mul(self, a, b, out=""):
self.layers += [Struct(name=out, op="Mul", input=[a, b])]
def sigmoid(self, x, out=''):
self.layers += [Struct(name=out, op='Sigmoid', input=[x])]
def add(self, a, b, out=""):
self.layers += [Struct(name=out, op="Add", input=[a, b])]
def tanh(self, x, out=''):
self.layers += [Struct(name=out, op='Tanh', input=[x])]
def sub(self, a, b, out=""):
self.layers += [Struct(name=out, op="Sub", input=[a, b])]
def reduce(self, op, x, axis=-1, out=''):
self.layers += [Struct(name=out, op='Reduce'+op, axis=axis, input=[x])]
def sigmoid(self, x, out=""):
self.layers += [Struct(name=out, op="Sigmoid", input=[x])]
def pool(self, op, x, out=''):
self.layers += [Struct(name=out, op=op+'Pool', input=[x])]
def tanh(self, x, out=""):
self.layers += [Struct(name=out, op="Tanh", input=[x])]
def strided_slice(self, x, begin, end, strides, rank, out=''):
self.layers += [Struct(name=out, op='StridedSlice', rank=rank, starts=begin, ends=end, slice_strides=strides, input=[x])]
def mean(name, input, axis=-1):
''' combines mean operation out of several simpler ops
'''
nn = Build(name)
if np.array_equal(axis, [1,2]):
nn.pool('GlobalAvg', input, out=name)
elif np.array_equal(axis, [1,2,3]):
nn.reduce('Mean', # over channels
nn.pool('GlobalAvg', input), # over height & width
out=name)
elif np.array_equal(axis, [3]) or np.array_equal(axis, [-1]) or np.array_equal(axis, 3) or np.array_equal(axis, -1):
nn.reduce('Mean', input, out=name)
return nn.layers
def rnn(name, input, state, kernel, bias, new_state, number_of_gates=2):
""" - Ht = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
"""
def rnn(name, input, state, kernel, bias, new_state, number_of_gates = 2):
''' - Ht = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
'''
nn.tanh(nn.mad(kernel=kernel, bias=bias, x=nn.concat(input, state)), out=new_state)
nn.tanh(
nn.mad(kernel=kernel, bias=bias,
x=nn.concat(input, state)),
out=new_state);
def gru(
name,
input,
state,
kernel_r,
kernel_u,
kernel_c,
bias_r,
bias_u,
bias_c,
new_state,
number_of_gates=2,
):
""" - zt = f(Xt*Wz + Ht_1*Rz + Wbz + Rbz)
def gru(name, input, state, kernel_r, kernel_u, kernel_c, bias_r, bias_u, bias_c, new_state, number_of_gates = 2):
''' - zt = f(Xt*Wz + Ht_1*Rz + Wbz + Rbz)
"""
'''
nn = Build(name)
inputs = nn.concat(input, state)

c = nn.tanh(nn.mad(kernel=kernel_c, bias=bias_c, x=nn.concat(input, r_state)))
c = nn.tanh(nn.mad(kernel=kernel_c, bias=bias_c,
x=nn.concat(input, r_state)))
# new_h = u' * state + (1 - u') * c'
# = u' * state + c' - u' * c'

# - u' * c'
nn.sub(nn._, nn.mul(u, c), out=new_state)
return nn.layers
nn.sub(nn._, nn.mul(u, c),
out=new_state)
return nn.layers;
def lstm(
name,
input,
state_c,
state_h,
kernel_i,
kernel_j,
kernel_f,
kernel_o,
bias_i,
bias_j,
bias_f,
bias_o,
new_state_c,
new_state_h,
):
""" Full:
def lstm(name, input, state_c, state_h, kernel_i, kernel_j, kernel_f, kernel_o, bias_i, bias_j, bias_f, bias_o, new_state_c, new_state_h):
''' Full:
- it = f(Xt*Wi + Ht_1*Ri + Pi . Ct_1 + Wbi + Rbi)
- ft = f(Xt*Wf + Ht_1*Rf + Pf . Ct_1 + Wbf + Rbf)
- ct = g(Xt*Wc + Ht_1*Rc + Wbc + Rbc)

"""
'''
""" No peephole:
''' No peephole:
- it = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi)
- ft = f(Xt*Wf + Ht_1*Rf + Wbf + Rbf)
- ct = g(Xt*Wc + Ht_1*Rc + Wbc + Rbc)

"""
'''
j = nn.tanh(nn.mad(inputs, kernel_j, bias_j))
nn.add(nn.mul(state_c, f), nn.mul(i, j), out=new_state_c)
nn.add(
nn.mul(state_c, f), nn.mul(i, j),
out=new_state_c)
# new_h =
nn.mul(o, nn.tanh(new_state_c), out=new_state_h)
# new_h =
nn.mul(o, nn.tanh(new_state_c),
out=new_state_h)
# Serialize
class BarracudaWriter:

self.f = open(filename, "wb+")
self.f = open(filename, 'wb+')
def __enter__(self):
return self

def write_str(self, s):
self.write_int32(len(s))
self.f.write(s.encode("ascii"))
self.f.write(s.encode('ascii'))
self.f.write(struct.pack("<f", d))
self.f.write(struct.pack('<f', d))
self.f.write(struct.pack("<i", d))
self.f.write(struct.pack('<i', d))
self.f.write(struct.pack("<q", d))
self.f.write(struct.pack('<q', d))
def write_shape(self, s):
self.write_int32(len(s))

def close(self):
self.f.close()
# VERSION = 0xBA22AC0DA000 + BARRACUDA_VERSION
#VERSION = 0xBA22AC0DA000 + BARRACUDA_VERSION
w.write_int64(BARRACUDA_VERSION)
# inputs

w.write_str_array(model.outputs)
# memories
w.write_int32(len(model.memories) // 3)
for mem_shape, mem_in, mem_out in zip(
model.memories[0::3], model.memories[1::3], model.memories[2::3]
):
w.write_int32(len(model.memories)//3)
for mem_shape, mem_in, mem_out in zip(model.memories[0::3], model.memories[1::3], model.memories[2::3]):
w.write_shape(mem_shape)
w.write_str(mem_in)
w.write_str(mem_out)

w.write_int32(len(model.layers))
for l in model.layers:
assert not l.name in l.inputs
assert(not l.name in l.inputs)
w.write_int32(0) # dummy
w.write_int32(0) # dummy
w.write_int32(0) #dummy
w.write_int32(0) #dummy
w.write_shape(l.pads)
w.write_shape(l.strides)
w.write_shape(l.pool_size)

w.write_int32(0) # dummy
w.write_int32(0) #dummy
assert len(x.shape) == 4
assert x.data.nbytes % 4 == 0
length = (
x.data.nbytes >> 2
) # length is measured in float32s (at least for now)
assert(len(x.shape) == 4)
assert(x.data.nbytes % 4 == 0)
length = x.data.nbytes >> 2 # length is measured in float32s (at least for now)
w.write_str(x.name)
w.write_shape(x.shape)

for x in all_tensors:
w.write_array(x.data)
def print_known_operations(known_classes, known_activations):
print('OPS supported by the converter:')
for key in sorted(known_classes.keys()):
print(key)
print('ACTIVATIONS supported by the converter:')
for key in sorted(known_activations.keys()):
print(key)

926
ml-agents/mlagents/trainers/tensorflow_to_barracuda.py
File diff content is too large to display.

2
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta


fileFormatVersion: 2
guid: 83221ad3db87f4b3b91b041047cb2bc5
guid: 19ed1486aa27d4903b34839f37b8f69f
MonoImporter:
externalObjects: {}
serializedVersion: 2

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta


fileFormatVersion: 2
guid: 4b10c58689ee84c2abe895327686f532
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta


fileFormatVersion: 2
guid: e192a80b369ad4683a329432eeb5ec20
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef


{
"name": "Barracuda-editor",
"references": [],
"includePlatforms": [
"Editor"
],
"excludePlatforms": []
}

7
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta


fileFormatVersion: 2
guid: 9f1e7d835703842dda0e25142ed6c3c9
AssemblyDefinitionImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

8
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png

Width: 64 | Height: 64 | Size: 2.3 KiB

106
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta


fileFormatVersion: 2
guid: 8682ff569c4c7457a8a8e3a527aad537
TextureImporter:
fileIDToRecycleName: {}
externalObjects: {}
serializedVersion: 4
mipmaps:
mipMapMode: 0
enableMipMap: 0
sRGBTexture: 0
linearTexture: 0
fadeOut: 0
borderMipMap: 0
mipMapsPreserveCoverage: 0
alphaTestReferenceValue: 0.5
mipMapFadeDistanceStart: 1
mipMapFadeDistanceEnd: 3
bumpmap:
convertToNormalMap: 0
externalNormalMap: 0
heightScale: 0.25
normalMapFilter: 0
isReadable: 0
grayScaleToAlpha: 0
generateCubemap: 6
cubemapConvolution: 0
seamlessCubemap: 0
textureFormat: 1
maxTextureSize: 2048
textureSettings:
serializedVersion: 2
filterMode: -1
aniso: 1
mipBias: -1
wrapU: 1
wrapV: 1
wrapW: -1
nPOTScale: 0
lightmap: 0
compressionQuality: 50
spriteMode: 0
spriteExtrude: 1
spriteMeshType: 1
alignment: 0
spritePivot: {x: 0.5, y: 0.5}
spritePixelsToUnits: 100
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
spriteGenerateFallbackPhysicsShape: 1
alphaUsage: 1
alphaIsTransparency: 1
spriteTessellationDetail: -1
textureType: 2
textureShape: 1
maxTextureSizeSet: 0
compressionQualitySet: 0
textureFormatSet: 0
platformSettings:
- buildTarget: DefaultTexturePlatform
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Standalone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: iPhone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Android
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
spriteSheet:
serializedVersion: 2
sprites: []
outline: []
physicsShape: []
spritePackingTag:
userData:
assetBundleName:
assetBundleVariant:

42
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs


using System.IO;
using UnityEditor;
using UnityEngine;
using UnityEditor.Experimental.AssetImporters;
namespace Barracuda
{
/// <summary>
/// Asset Importer of barracuda models.
/// </summary>
[ScriptedImporter(1, new[] {"nn"})]
public class NNModelImporter : ScriptedImporter {
private const string iconName = "NNModelIcon";
private Texture2D iconTexture;
public override void OnImportAsset(AssetImportContext ctx)
{
var model = File.ReadAllBytes(ctx.assetPath);
var asset = ScriptableObject.CreateInstance<NNModel>();
asset.Value = model;
ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
ctx.SetMainObject(asset);
}
private Texture2D LoadIconTexture()
{
if (iconTexture == null)
{
string[] allCandidates = AssetDatabase.FindAssets(iconName);
if (allCandidates.Length > 0)
{
iconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
}
}
return iconTexture;
}
}
}

29
UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs


using System.IO;
using UnityEditor;
using UnityEngine;
using UnityEditor.Experimental.AssetImporters;
using MLAgents.InferenceBrain;
namespace MLAgents
{
/// <summary>
/// Asset Importer of barracuda models.
/// </summary>
[ScriptedImporter(1, new[] {"nn"})]
public class NNModelImporter : ScriptedImporter {
private const string IconPath = "Assets/ML-Agents/Resources/NNModelIcon.png";
public override void OnImportAsset(AssetImportContext ctx)
{
var model = File.ReadAllBytes(ctx.assetPath);
var asset = ScriptableObject.CreateInstance<NNModel>();
asset.Value = model;
Texture2D texture = (Texture2D)
AssetDatabase.LoadAssetAtPath(IconPath, typeof(Texture2D));
ctx.AddObjectToAsset(ctx.assetPath, asset, texture);
ctx.SetMainObject(asset);
}
}
}

8
UnitySDK/Assets/ML-Agents/Resources/NNModelIcon.png


106
UnitySDK/Assets/ML-Agents/Resources/NNModelIcon.png.meta


fileFormatVersion: 2
guid: 8682ff569c4c7457a8a8e3a527aad537
TextureImporter:
fileIDToRecycleName: {}
externalObjects: {}
serializedVersion: 4
mipmaps:
mipMapMode: 0
enableMipMap: 0
sRGBTexture: 0
linearTexture: 0
fadeOut: 0
borderMipMap: 0
mipMapsPreserveCoverage: 0
alphaTestReferenceValue: 0.5
mipMapFadeDistanceStart: 1
mipMapFadeDistanceEnd: 3
bumpmap:
convertToNormalMap: 0
externalNormalMap: 0
heightScale: 0.25
normalMapFilter: 0
isReadable: 0
grayScaleToAlpha: 0
generateCubemap: 6
cubemapConvolution: 0
seamlessCubemap: 0
textureFormat: 1
maxTextureSize: 2048
textureSettings:
serializedVersion: 2
filterMode: -1
aniso: 1
mipBias: -1
wrapU: 1
wrapV: 1
wrapW: -1
nPOTScale: 0
lightmap: 0
compressionQuality: 50
spriteMode: 0
spriteExtrude: 1
spriteMeshType: 1
alignment: 0
spritePivot: {x: 0.5, y: 0.5}
spritePixelsToUnits: 100
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
spriteGenerateFallbackPhysicsShape: 1
alphaUsage: 1
alphaIsTransparency: 1
spriteTessellationDetail: -1
textureType: 2
textureShape: 1
maxTextureSizeSet: 0
compressionQualitySet: 0
textureFormatSet: 0
platformSettings:
- buildTarget: DefaultTexturePlatform
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Standalone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: iPhone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Android
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
spriteSheet:
serializedVersion: 2
sprites: []
outline: []
physicsShape: []
spritePackingTag:
userData:
assetBundleName:
assetBundleVariant:

10
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs


using UnityEngine;
namespace MLAgents.InferenceBrain
{
public class NNModel : ScriptableObject
{
[HideInInspector]
public byte[] Value;
}
}

11
UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta


fileFormatVersion: 2
guid: fb1293e6d636b46d09ae35b36241a0c6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

/UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs.meta → /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta
