
Merge pull request #2049 from Unity-Technologies/develop-barracuda-0.2.0

Barracuda 0.2.1 -> develop
/develop-generalizationTraining-TrainerController
GitHub · 6 years ago
Current commit
f13d0f11
62 files changed, with 4,891 insertions and 4,000 deletions
  1. UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta (2 changes)
  2. UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta (2 changes)
  3. UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta (2 changes)
  4. UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta (2 changes)
  5. UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta (2 changes)
  6. UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta (2 changes)
  7. UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta (2 changes)
  8. UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta (2 changes)
  9. UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta (2 changes)
  10. UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta (2 changes)
  11. UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta (2 changes)
  12. UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta (2 changes)
  13. UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat (5 changes)
  14. UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat (5 changes)
  15. UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta (2 changes)
  16. UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta (2 changes)
  17. UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta (2 changes)
  18. UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta (2 changes)
  19. UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta (2 changes)
  20. UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta (2 changes)
  21. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md (111 changes)
  22. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll (1000 changes)
  23. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute (918 changes)
  24. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute (944 changes)
  25. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute (68 changes)
  26. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute (596 changes)
  27. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute (632 changes)
  28. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute (438 changes)
  29. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute (30 changes)
  30. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute (944 changes)
  31. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute (214 changes)
  32. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute (484 changes)
  33. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc (44 changes)
  34. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc (480 changes)
  35. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute (112 changes)
  36. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md (57 changes)
  37. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json (2 changes)
  38. UnitySDK/Assets/ML-Agents/Scripts/Agent.cs (5 changes)
  39. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/ApplierImpl.cs (31 changes)
  40. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs (23 changes)
  41. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/GeneratorImpl.cs (22 changes)
  42. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorApplier.cs (15 changes)
  43. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/TensorGenerator.cs (15 changes)
  44. UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs (11 changes)
  45. ml-agents/mlagents/trainers/barracuda.py (370 changes)
  46. ml-agents/mlagents/trainers/tensorflow_to_barracuda.py (926 changes)
  47. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta (2 changes)
  48. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta (8 changes)
  49. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta (8 changes)
  50. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef (8 changes)
  51. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta (7 changes)
  52. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png (8 changes)
  53. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta (106 changes)
  54. UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs (42 changes)
  55. UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs (29 changes)
  56. UnitySDK/Assets/ML-Agents/Resources/NNModelIcon.png (8 changes)
  57. UnitySDK/Assets/ML-Agents/Resources/NNModelIcon.png.meta (106 changes)
  58. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs (10 changes)
  59. UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta (11 changes)
  60. /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta (0 changes)

UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat (5 changes)

m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.10980392, g: 0.6039216, b: 1, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat (5 changes)

m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.5, g: 0.5, b: 0.5, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta (2 changes)

userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
+ script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md (111 changes)

### Load Model into Barracuda
Once you have your TensorFlow (or ONNX) model converted, you can load the resulting Barracuda file via `ModelLoader`:
```C#
- var model = ModelLoader.LoadFromStreamingAssets(modelName + ".bytes");
+ var model = ModelLoader.LoadFromStreamingAssets(modelName + ".nn");
```
Another option is to use the editor model importer. Just add a public `NNModel` field to your C# script and assign the ``.nn`` model file via the editor UI:
```C#
public NNModel modelSource;
<..>
var model = ModelLoader.Load(modelSource);
- var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputeFast, model)
+ var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputePrecompiled, model)
```
### Execute the model

Execution is asynchronous for GPU backends. The implementation is currently synchronous for CPU backends, but it is safest to assume that execution will be asynchronous for all backends in the future.
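For orientation, a minimal sketch of driving the worker created above. This assumes a single-input `Execute(Tensor)` overload; the 1x1x1x10 input shape is a placeholder, not something this document specifies:
```C#
// Hedged sketch: the shape and the single-input Execute overload are assumptions.
var input = new Tensor(1, 1, 1, 10); // batch, height, width, channels
worker.Execute(input);               // async on GPU backends, sync on CPU (for now)
```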
### Fetch outputs
- If the model has only a single output, then simple `worker.Fetch()` can be used; otherwise output names should be provided.
+ If the model has only a single output, then simple `worker.Peek()` can be used; otherwise output names should be provided.
```C#
- var O = worker.Fetch(outputName);
+ var O = worker.Peek(outputName);
```
_Note:_ ``Peek()`` does not take ownership of the tensor. If you expect to keep the tensor for a longer time, use ``Fetch()`` instead.
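Because this ownership difference matters for resource lifetime, here is a short hedged sketch of the two calls (`outputName` stands in for whatever output your model declares):
```C#
var borrowed = worker.Peek(outputName); // still owned by the worker; do not hold on to it
var kept = worker.Fetch(outputName);    // ownership transferred to the caller
// ... use `kept` for as long as needed, then release it yourself (see Cleanup below)
```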
### Cleanup
As a Barracuda client you are responsible for `Dispose`-ing the _worker_, the _inputs_, and any _outputs_ you have fetched. This is necessary to properly free GPU resources.
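A typical teardown, as a minimal sketch continuing the examples above:
```C#
// Dispose everything you own once inference is finished,
// so the GPU buffers behind the worker and tensors are released.
input.Dispose();
kept.Dispose();
worker.Dispose();
```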

### Texture as output
If you want to use Barracuda execution results further in the graphics pipeline, you can copy data from a `Tensor` into a `RenderTexture` without stalling the CPU or GPU:
```C#
- var tensor = worker.Fetch();
+ var tensor = worker.Peek();
var texture = BarracudaTextureUtils.TensorToRenderTexture(tensor);
```
If you wish, you can reuse the same `RenderTexture` multiple times:

```C#
- var tensor = worker.Fetch();
+ var tensor = worker.Peek();
BarracudaTextureUtils.TensorToRenderTexture(tensor, texture);
```

Convert from TensorFlow:
```bash
- python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.bytes
+ python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn
- python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.bytes
+ python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.nn
```
If the network has multiple outputs but you only need particular ones during inference, there is an optional `-trim` flag to remove unused outputs and calculations.

Trim will first remove the outputs that do not match the regular expression from the graph. In this case only outputs ending with `action` will be left.
Next, trim will strip all nodes that do not participate in the evaluation of the output.
You can pass `--print-supported-ops` to get an approximate list of supported operations/activations for the specific converter.
- P.S. Python 3.5 or 3.6 is recommended
## Approximate list of supported layers/operations for TensorFlow converter
```
Activation
Add
AvgPool
BatchNormalization
BatchNormalizationRuntime
BiasAdd
Concat
Conv2D
Conv2DBackpropInput
Dense
DepthwiseConv2dNative
Flatten
FusedBatchNorm
GlobalAveragePool
GlobalAvgPool
InstanceNormalization
LRN
MatMul
Max
MaxPool
Maximum
Mean
Min
Minimum
Mul
Multinomial
Nop
OneHot
Pad
Pow
Prod
RandomStandardNormal
RandomUniform
RealDiv
Reshape
ResizeBicubic
ResizeBilinear
ResizeNearestNeighbor
StridedSlice
Sub
Sum
```
## Approximate list of supported activations for TensorFlow converter
```
Abs
Acos
Acosh
Asin
Asinh
Atan
Atanh
Ceil
Cos
Cosh
Elu
Exp
Floor
LeakyRelu
Linear
Log
LogSoftmax
Neg
Relu
Relu6
Selu
Sigmoid
Sin
Sinh
Softmax
Softplus
Softsign
Sqrt
Swish
Tan
Tanh
```
P.S. Some of these operations are under limited support; not all configurations are properly supported.
- P.P.S. We plan to migrate Tensorflow and ONNX converters from Python to C# in the future.
+ P.P.S. Python 3.5 or 3.6 is recommended
+ P.P.P.S. We plan to migrate Tensorflow and ONNX converters from Python to C# in the future.

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll (1000 changes)

Diff too large to display.

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute (918 changes)

#pragma kernel Relu_Flat
#pragma kernel Relu_Loop
#pragma kernel Relu6_Flat
#pragma kernel Relu6_Loop
#pragma kernel Tanh_Flat
#pragma kernel Tanh_Loop
#pragma kernel Swish_Flat
#pragma kernel Swish_Loop
#pragma kernel Sigmoid_Flat
#pragma kernel Sigmoid_Loop
#pragma kernel Elu_Flat
#pragma kernel Elu_Loop
#pragma kernel LeakyRelu_Flat
#pragma kernel LeakyRelu_Loop
#pragma kernel Exp_Flat
#pragma kernel Exp_Loop
#pragma kernel Log_Flat
#pragma kernel Log_Loop
#pragma kernel Pow_Flat
#pragma kernel Pow_Loop
/*
Relu_Flat (NEW) vs Relu_Nyxc+Relu_CNyx+Relu
Compute Precompiled
VGG@1
<<<Exec #128: 59.6 ms, cpu: .9 ms, avg: 62.4 ms, result:OK <--- NEW!
<<<Exec #128: 63.6 ms, cpu: .9 ms, avg: 64.0 ms, result:OK
VGG@4
<<<Exec #16: 276.7 ms, cpu: .9 ms, avg: 272.8 ms, result:OK <--- NEW!
<<<Exec #16: 297.5 ms, cpu: .9 ms, avg: 274.4 ms, result:OK
RES@1
<<<Exec #100: 82.2 ms, cpu: 22.2 ms, avg: 81.0 ms, result:OK <--- NEW!
<<<Exec #100: 82.1 ms, cpu: 22.5 ms, avg: 85.4 ms, result:OK
PPO_2@256
<<<Exec #200: 10.3 ms, cpu: 7.6 ms, avg: 11.9 ms, result:OK <--- NEW!
<<<Exec #200: 10.9 ms, cpu: 8.3 ms, avg: 12.3 ms, result:OK
PPO_CNN@256
<<<Exec #100: 60.6 ms, cpu: 62.3 ms, avg: 65.6 ms, result:OK <--- NEW!
<<<Exec #100: 72.6 ms, cpu: 62.7 ms, avg: 66.0 ms, result:OK
*/
#pragma kernel Relu
#pragma kernel Relu_CNyx
#pragma kernel Relu_Nyxc

#pragma kernel Exp
#pragma kernel Exp_CNyx
#pragma kernel Exp_Nyxc
#pragma kernel Log
#pragma kernel Log_CNyx
#pragma kernel Log_Nyxc
#pragma kernel Pow
#pragma kernel Pow_CNyx
#pragma kernel Pow_Nyxc

TENSOR_DECL_RW(O)
float _Alpha;
uint _LoopStride;
#define FLAT_ACTIVATION(name, op_name) \
void name##_Flat (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
    DISPATCH_ARGS(O.length, 1, 1)\
    TENSOR_ARGS2(X, O);\
\
    uint i = dispatchThreadID.x;\
    if (i >= O.GetLength()) return;\
\
    float v = X.Get(i);\
    v = op_name (v);\
    O.Set(i, v);\
}

#define LOOP_ACTIVATION(name, op_name) \
void name##_Loop (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
    DISPATCH_ARGS(O.length, 1, 1)\
    TENSOR_ARGS2(X, O);\
\
    uint i = dispatchThreadID.x;\
    uint len = O.GetLength();\
\
    while (i < len) {\
        float v = X.Get(i); \
        v = op_name (v); \
        O.Set(i, v); \
        i += _LoopStride; \
    }\
}

#define ACTIVATION(name, op_name) \
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
FLAT_ACTIVATION(name, op_name)\
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
LOOP_ACTIVATION(name, op_name)
float relu(float v)
{
    return 0.5f * (v + abs(v));
}

float relu6(float v)
{
    return min(max(0, v), 6);
}

float swish(float v)
{
    return v / (1.f + exp(-v));
}

float sigmoid(float v)
{
    return 1.f / (1.f + exp(-v));
}

float elu(float v)
{
    if (v <= 0)
        v = _Alpha * (exp(v) - 1);
    return v;
}

float lrelu(float v)
{
    return max(v, _Alpha * v);
}

- float signed_pow(float f, float e)
+ float signed_pow(float f)
{
+   float e = _Alpha;
    // handle negative f
    float v = pow(abs(f), e);
    float s = (e % 2 == 1) ?
        sign(f): // exponent is odd  => sign(f) * pow(abs(f), e)
        1;       // exponent is even => pow(abs(f), e)
    return v * s;
}
ACTIVATION(Relu, relu)
ACTIVATION(Relu6, relu6)
ACTIVATION(Tanh, tanh)
ACTIVATION(Sigmoid, sigmoid)
ACTIVATION(Swish, swish)
ACTIVATION(Elu, elu)
ACTIVATION(LeakyRelu, lrelu)
ACTIVATION(Exp, exp)
ACTIVATION(Log, log)
ACTIVATION(Pow, signed_pow)
// -------------------

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Relu(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = relu(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Relu6(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = relu6(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Tanh(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = tanh(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Sigmoid(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = sigmoid(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Swish(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = swish(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Elu(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = elu(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void LeakyRelu(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = lrelu(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Exp(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = exp(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Log(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
        v = log(v);
        O.Set(n, y, x, c, v);
    }
}

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Pow(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
    for (uint n = 0; n < X.batch; ++n)
    {
        float v = X.Get(n, y, x, c);
-       v = signed_pow(v, _Alpha);
+       v = signed_pow(v);
        O.Set(n, y, x, c, v);
    }
}
NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Relu_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = relu(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Relu_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = relu(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Relu6_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = relu6(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Relu6_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = relu6(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Tanh_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = tanh(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Tanh_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = tanh(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Sigmoid_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = sigmoid(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Sigmoid_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = sigmoid(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Swish_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = swish(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Swish_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = swish(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Elu_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = elu(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Elu_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = elu(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void LeakyRelu_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = lrelu(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void LeakyRelu_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = lrelu(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Exp_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = exp(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Exp_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = exp(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Log_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = log(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Log_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
    v = log(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
void Pow_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);
    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (c >= X.channels) return;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
-   v = signed_pow(v, _Alpha);
+   v = signed_pow(v);
    O.Set(n, y, x, c, v);
}

NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Pow_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
    TENSOR_ARGS2(X, O);
    uint nyxc = dispatchThreadID.x;
    uint c = nyxc % X.channels;
    uint nyx = nyxc / X.channels;
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;
    if (n >= X.batch) return;
    float v = X.Get(n, y, x, c);
-   v = signed_pow(v, _Alpha);
+   v = signed_pow(v);
    O.Set(n, y, x, c, v);
}

    DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
    TENSOR_ARGS2(X, O);

    uint x = dispatchThreadID.x;
    uint y = dispatchThreadID.y;

    if (x >= O.GetFlatWidth()) return;
    if (y >= O.GetFlatHeight()) return;

    float maxV = -FLT_MAX;
    for (uint i = 0; i < X.GetFlatWidth(); ++i)
    {
        float v = X.Get(y, i);
        if (v > maxV)
            maxV = v;
    }

    float acc = 0.0f;
    for (i = 0; i < X.GetFlatWidth(); ++i)
    {
        float v = X.Get(y, i);
        acc += exp(v - maxV);
    }

    float v = X.Get(y, x);
    v = exp(v - maxV) / acc;
    O.Set(y, x, v);
}

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute (944 changes)

Diff too large to display.

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute (68 changes)

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastAdd(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS3(X, B, O);

    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;

    for (uint n = 0; n < X.batch; ++n)
    {

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastSub(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS3(X, B, O);

    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;

    for (uint n = 0; n < X.batch; ++n)
    {

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMul(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS3(X, B, O);

    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;

    for (uint n = 0; n < O.batch; ++n)
    {

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastDiv(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS3(X, B, O);

    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;

    for (uint n = 0; n < X.batch; ++n)
    {

float signed_pow(float f, float e)
{
    // handle negative f
    float v = pow(abs(f), e);
    float s = (e % 2 == 1) ?
        sign(f): // exponent is odd  => sign(f) * pow(abs(f), e)
        1;       // exponent is even => pow(abs(f), e)
    return v * s;
}

    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS3(X, B, O);

    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;

    for (uint n = 0; n < X.batch; ++n)
    {

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMin(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS3(X, B, O);

    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;

    for (uint n = 0; n < X.batch; ++n)
    {

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMax(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.width, O.height);
    TENSOR_ARGS3(X, B, O);

    uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
    if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;

    for (uint n = 0; n < X.batch; ++n)
    {

UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute (596 changes)

#pragma kernel Conv2D
#pragma kernel Conv2D_RegisterBlock4x2
- //#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
+ #pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
+ #pragma kernel Conv2D_L1Cached32_RegisterBlock4x4
#pragma kernel DepthwiseConv2D

NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
void Conv2D(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(K.kernelCount, O.width, O.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);

    uint k = dispatchThreadID.x;
    uint x = dispatchThreadID.y;
    uint y = dispatchThreadID.z;

    if (k >= K.channels) return;
    if (x >= O.width) return;
    if (y >= O.height) return;

    uint2 leftCorner = _Pad.xy;
    uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
    for (uint n = 0; n < O.batch; ++n)
    {
        float acc = B.Get(k);
        for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
        {
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);
                // @TODO: investigate
                // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
                if (any(pos < leftCorner)) continue;
                if (any(pos >= rightCorner)) continue;

                for (uint c = 0; c < X.channels; ++c)
                    acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
            }
        }

        O.Set(n, y, x, k, acc);
    }
}

void Conv2D_RegisterBlock4x2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(K.kernelCount, O.width, O.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);

    uint k = dispatchThreadID.x;
    uint x = dispatchThreadID.y;
    uint y = dispatchThreadID.z;

    if (k >= K.channels) return;
    if (x*SIZE_W >= O.width) return;
    if (y*SIZE_H >= O.height) return;

    uint2 leftCorner = _Pad.xy;
    uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
    for (uint n = 0; n < O.batch; ++n)
    {
        float acc[SIZE_H*SIZE_W];
        [unroll]
        for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
            acc[q] = B.Get(k);

        for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
        {
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                uint2 pos[SIZE_H*SIZE_W];
                [unroll]
                for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
                    pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);

                for (uint c = 0; c < X.channels; ++c)
                    [unroll]
                    for (q = 0; q < SIZE_H*SIZE_W; ++q)
                        if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
                            acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
            }
        }

        [unroll]
        for (q = 0; q < SIZE_H*SIZE_W; ++q)
            O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
    }
}

- #undef L1CACHESIZE
- #define L1CACHESIZE 64
- #undef SIZE
- #define SIZE 4
- groupshared float Conv2D_L1Cached64_Reg_Loop_safe_X[SIZE*SIZE][L1CACHESIZE];
- [numthreads(L1CACHESIZE, 1, 1)]
- void Conv2D_L1Cached64_RegisterBlock4x4(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
- {
-     DISPATCH_ARGS(K.kernelCount, O.width, O.height);
-     TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
-     #define X_ Conv2D_L1Cached64_Reg_Loop_safe_X
-     uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
-     uint x = groupID.y;
-     uint y = groupID.z;
-     // need all threads to load channels, thus will do late check against kernel count
-     if (x*SIZE >= O.width) return;
-     if (y*SIZE >= O.height) return;
-     for (uint n = 0; n < O.batch; ++n)
-     {
-         float acc[SIZE*SIZE];
-         [unroll]
-         for (uint q = 0; q < SIZE*SIZE; ++q)
-             acc[q] = B.SafeGet(k);
-         for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
-         {
-             for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
-             {
-                 uint2 pos[SIZE*SIZE];
-                 [unroll]
-                 for (uint q = 0; q < SIZE*SIZE; ++q)
-                     pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);
-                 for (uint c = 0; c < X.channels; c += L1CACHESIZE)
-                 {
-                     // Cache X
-                     uint dc = groupThreadID.x;
-                     [unroll]
-                     for (q = 0; q < SIZE*SIZE; ++q)
-                         X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
-                     GroupMemoryBarrierWithGroupSync();
-                     // X * K
-                     if (k < K.channels) // need all threads to load channels, thus late check against kernel count
-                     {
-                         uint kIndex = K.Index(dy, dx, c, k);
-                         for (dc = 0; dc < L1CACHESIZE; ++dc)
-                         {
-                             [unroll]
-                             for (q = 0; q < SIZE*SIZE; ++q)
-                                 acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
-                             kIndex += K.channels;
-                         }
-                     }
-                     GroupMemoryBarrierWithGroupSync();
-                 }
-             }
-         }
-         uint remainderW = (O.width - x*SIZE);
-         uint remainderH = (O.height - y*SIZE);
-         if (k < K.channels) // need all threads to load channels, thus late check against kernel count
-             [unroll]
-             for (q = 0; q < SIZE*SIZE; ++q)
-                 if (q/SIZE < remainderH && q%SIZE < remainderW)
-                     O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
-     }
-     #undef X_
- }
+ #define CONV2D_L1CACHED(L1CACHESIZE, SIZE, FMA) \
+ groupshared float Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[SIZE*SIZE][L1CACHESIZE];\
+ [numthreads(L1CACHESIZE, 1, 1)]\
+ void Conv2D_L1Cached##L1CACHESIZE##_RegisterBlock##SIZE##x##SIZE(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)\
+ {\
+     DISPATCH_ARGS(K.kernelCount, O.width, O.height);\
+     TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);\
+ \
+     uint k = L1CACHESIZE * groupID.x + groupThreadID.x;\
+     uint x = groupID.y;\
+     uint y = groupID.z;\
+ \
+     if (x*SIZE >= O.width) return;\
+     if (y*SIZE >= O.height) return;\
+ \
+     for (uint n = 0; n < O.batch; ++n)\
+     {\
+         float acc[SIZE*SIZE];\
+         [unroll]\
+         for (uint q = 0; q < SIZE*SIZE; ++q)\
+             acc[q] = B.SafeGet(k);\
+ \
+         for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)\
+         {\
+             for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)\
+             {\
+                 uint2 pos[SIZE*SIZE];\
+                 [unroll]\
+                 for (uint q = 0; q < SIZE*SIZE; ++q)\
+                     pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);\
+ \
+                 for (uint c = 0; c < X.channels; c += L1CACHESIZE)\
+                 {\
+                     uint dc = groupThreadID.x;\
+                     [unroll]\
+                     for (q = 0; q < SIZE*SIZE; ++q)\
+                         Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);\
+                     GroupMemoryBarrierWithGroupSync();\
+ \
+                     if (k < K.channels)\
+                     {\
+                         uint kIndex = K.Index(dy, dx, c, k);\
+                         for (dc = 0; dc < L1CACHESIZE; ++dc)\
+                         {\
+                             [unroll]\
+                             for (q = 0; q < SIZE*SIZE; ++q)\
+                                 acc[q] = FMA(Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc], K.data[kIndex], acc[q]);\
+                             kIndex += K.channels;\
+                         }\
+                     }\
+                     GroupMemoryBarrierWithGroupSync();\
+                 }\
+             }\
+         }\
+ \
+         uint remainderW = (O.width - x*SIZE);\
+         uint remainderH = (O.height - y*SIZE);\
+ \
+         if (k < K.channels)\
+             [unroll]\
+             for (q = 0; q < SIZE*SIZE; ++q)\
+                 if (q/SIZE < remainderH && q%SIZE < remainderW)\
+                     O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);\
+     }\
+ \
+ }
+ CONV2D_L1CACHED(64,4, fastfma)
+ CONV2D_L1CACHED(32,4, fastfma)

    DISPATCH_ARGS(K.kernelCount, O.width, O.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);

    uint k = dispatchThreadID.x;
    uint x = dispatchThreadID.y;
    uint y = dispatchThreadID.z;

    if (k >= K.channels) return;
    if (x >= O.width) return;
    if (y >= O.height) return;

    uint2 leftCorner = _Pad.xy;
    uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;

    uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
    uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());

    if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
    {
        // path with edge-cases checks
        for (uint n = 0; n < O.batch; ++n)
        {
            float acc = B.Get(k);
            for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                uint2 pos = leftKernelCorner + uint2(dx, dy);
                if (any(pos < leftCorner)) continue;
                if (any(pos >= rightCorner)) continue;

                acc = fastfma(
                    X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
                    K.Get(dy, dx, 0, k),
                    acc);
            }
            O.Set(n, y, x, k, acc);
        }
    }
    else
    {
        // kernel is guaranteed to be within X,
        // no need to check against edge-cases
        leftKernelCorner -= leftCorner;
        for (uint n = 0; n < O.batch; ++n)
        {
            float acc = B.Get(k);
            for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                uint2 pos = leftKernelCorner + uint2(dx, dy);
                acc = fastfma(
                    X.Get(n, pos, k),
                    K.Get(dy, dx, 0, k),
                    acc);
            }
            O.Set(n, y, x, k, acc);
        }
    }
}

{
    // NOTE: dispatched over X (not O)
    DISPATCH_ARGS(K.kernelCount, X.width, X.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);