浏览代码

First stage of ML Agents update to Barracuda 0.2.x

/develop-generalizationTraining-TrainerController
Mantas Puida 6 年前
当前提交
27567062
共有 55 个文件被更改,包括 4646 次插入3834 次删除
  1. 2
      UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta
  2. 2
      UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta
  3. 2
      UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta
  4. 2
      UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta
  5. 2
      UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta
  6. 2
      UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta
  7. 2
      UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta
  8. 2
      UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta
  9. 2
      UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta
  10. 2
      UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta
  11. 2
      UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta
  12. 2
      UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta
  13. 5
      UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat
  14. 5
      UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat
  15. 2
      UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta
  16. 2
      UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta
  17. 2
      UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta
  18. 2
      UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta
  19. 2
      UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta
  20. 2
      UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta
  21. 23
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md
  22. 997
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
  23. 918
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute
  24. 944
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute
  25. 68
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute
  26. 566
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute
  27. 632
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/ConvOld.compute
  28. 438
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Dense.compute
  29. 30
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/DenseFP16.compute
  30. 944
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Experimental.compute
  31. 214
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/FastNV.compute
  32. 483
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Generic.compute
  33. 44
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Random.cginc
  34. 480
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Tensor.cginc
  35. 112
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/TexConv.compute
  36. 23
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/ReleaseNotes.md
  37. 2
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/package.json
  38. 2
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/BarracudaModelParamLoader.cs
  39. 7
      UnitySDK/Assets/ML-Agents/Scripts/LearningBrain.cs
  40. 354
      ml-agents/mlagents/trainers/barracuda.py
  41. 914
      ml-agents/mlagents/trainers/tensorflow_to_barracuda.py
  42. 2
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta
  43. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor.meta
  44. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor.meta
  45. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef
  46. 7
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef.meta
  47. 8
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png
  48. 106
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelIcon.png.meta
  49. 42
      UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs
  50. 29
      UnitySDK/Assets/ML-Agents/Editor/NNModelImporter.cs
  51. 10
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs
  52. 11
      UnitySDK/Assets/ML-Agents/Scripts/InferenceBrain/NNModel.cs.meta
  53. 0
      /UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Plugins/Editor/BarracudaEditor/NNModelImporter.cs.meta

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallHardLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/3DBall/TFModels/3DBallLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/BananaCollectors/TFModels/BananaLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Basic/TFModels/BasicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Bouncer/TFModels/BouncerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerDynamicLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Crawler/TFModels/CrawlerStaticLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/GridWorld/TFModels/GridWorldLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Hallway/TFModels/HallwayLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/PushBlock/TFModels/PushBlockLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Pyramids/TFModels/PyramidsLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Reacher/TFModels/ReacherLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/BlueAgent.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.10980392, g: 0.6039216, b: 1, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

5
UnitySDK/Assets/ML-Agents/Examples/SharedAssets/Materials/Wall.mat


m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
- _SpecGlossMap:
m_Texture: {fileID: 0}
m_Scale: {x: 1, y: 1}
m_Offset: {x: 0, y: 0}
m_Floats:
- _BumpScale: 1
- _Cutoff: 0.5

m_Colors:
- _Color: {r: 0.5, g: 0.5, b: 0.5, a: 1}
- _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
- _SpecColor: {r: 0.2, g: 0.2, b: 0.2, a: 1}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/GoalieLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Soccer/TFModels/StrikerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Tennis/TFModels/TennisLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/Walker/TFModels/WalkerLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/BigWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

2
UnitySDK/Assets/ML-Agents/Examples/WallJump/TFModels/SmallWallJumpLearning.nn.meta


userData:
assetBundleName:
assetBundleVariant:
script: {fileID: 11500000, guid: 83221ad3db87f4b3b91b041047cb2bc5, type: 3}
script: {fileID: 11500000, guid: 19ed1486aa27d4903b34839f37b8f69f, type: 3}

23
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda.md


### Load Model into Barracuda
Once you have your TensorFlow (or ONNX) model converted, you can load resulting Barracuda file via `ModelLoader`:
```C#
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".bytes");
var model = ModelLoader.LoadFromStreamingAssets(modelName + ".nn");
```
Another option is to use editor model importer. Just add public `NNModel` field to your C# script and assing ``.nn`` model file via editor UI:
```C#
public NNModel modelSource;
<..>
var model = ModelLoader.Load(modelSource);
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputeFast, model)
var worker = BarracudaWorkerFactory.CreateWorker(BarracudaWorkerFactory.Type.ComputePrecompiled, model)
```
### Execute the model

Execution is asynchronous for GPU backends. Currently implementation is synchronous for CPU backends, however it is good to assume that execution will be async for all backends in the future.
### Fetch outputs
If model has only single output, then simple `worker.Fetch()` can be used, otherwise output names should be provided.
If model has only single output, then simple `worker.Peek()` can be used, otherwise output names should be provided.
var O = worker.Fetch(outputName);
var O = worker.Peek(outputName);
_Note:_ ``Peek()`` does not take ownership of the tensor. If you expect to keep tensor for longer time use ``Fetch()``
### Cleanup
As a Barracuda client you are responsible to `Dispose` _worker_, _inputs_ and _outputs_ you fetched. This is necessary to properly free GPU resources.

### Texture as output
If you want to use Barracuda execution results further in the graphics pipeline, you can copy data from `Tensor` into `RenderTexture` without stalling CPU or GPU:
```C#
var tensor = worker.Fetch();
var tensor = worker.Peek();
var texture = BarracudaTextureUtils.TensorToRenderTexture(tensor);
```
If you wish, you can reuse the same `RenderTexture` multiple times:

var tensor = worker.Fetch();
var tensor = worker.Peek();
BarracudaTextureUtils.TensorToRenderTexture(tensor, texture);
```

Convert from TensorFlow:
```bash
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.bytes
python tensorflow_to_barracuda.py Models/3DBall-tf-model.pb Destination/3DBall-bc.nn
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.bytes
python onnx_to_barracuda.py Models/mnist/model.onnx Destination/mnist-bc.nn
```
If network has multiple outputs, but you need only particular ones during the inference, there is an optional `-trim` flag to remove unused outputs and calculations.

997
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Barracuda.dll
文件差异内容过多而无法显示
查看文件

918
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Activation.compute


#pragma kernel Relu_Flat
#pragma kernel Relu_Loop
#pragma kernel Relu6_Flat
#pragma kernel Relu6_Loop
#pragma kernel Tanh_Flat
#pragma kernel Tanh_Loop
#pragma kernel Swish_Flat
#pragma kernel Swish_Loop
#pragma kernel Sigmoid_Flat
#pragma kernel Sigmoid_Loop
#pragma kernel Elu_Flat
#pragma kernel Elu_Loop
#pragma kernel LeakyRelu_Flat
#pragma kernel LeakyRelu_Loop
#pragma kernel Exp_Flat
#pragma kernel Exp_Loop
#pragma kernel Log_Flat
#pragma kernel Log_Loop
#pragma kernel Pow_Flat
#pragma kernel Pow_Loop
/*
Relu_Flat (NEW) vs Relu_Nyxc+Relu_CNyx+Relu
Compute Precompiled
VGG@1
<<<Exec #128: 59.6 ms, cpu: .9 ms, avg: 62.4 ms, result:OK <--- NEW!
<<<Exec #128: 63.6 ms, cpu: .9 ms, avg: 64.0 ms, result:OK
VGG@4
<<<Exec #16: 276.7 ms, cpu: .9 ms, avg: 272.8 ms, result:OK <--- NEW!
<<<Exec #16: 297.5 ms, cpu: .9 ms, avg: 274.4 ms, result:OK
RES@1
<<<Exec #100: 82.2 ms, cpu: 22.2 ms, avg: 81.0 ms, result:OK <--- NEW!
<<<Exec #100: 82.1 ms, cpu: 22.5 ms, avg: 85.4 ms, result:OK
PPO_2@256
<<<Exec #200: 10.3 ms, cpu: 7.6 ms, avg: 11.9 ms, result:OK <--- NEW!
<<<Exec #200: 10.9 ms, cpu: 8.3 ms, avg: 12.3 ms, result:OK
PPO_CNN@256
<<<Exec #100: 60.6 ms, cpu: 62.3 ms, avg: 65.6 ms, result:OK <--- NEW!
<<<Exec #100: 72.6 ms, cpu: 62.7 ms, avg: 66.0 ms, result:OK
*/
#pragma kernel Relu
#pragma kernel Relu_CNyx
#pragma kernel Relu_Nyxc

#pragma kernel Exp
#pragma kernel Exp_CNyx
#pragma kernel Exp_Nyxc
#pragma kernel Log
#pragma kernel Log_CNyx
#pragma kernel Log_Nyxc
#pragma kernel Pow
#pragma kernel Pow_CNyx
#pragma kernel Pow_Nyxc

TENSOR_DECL_RW(O)
float _Alpha;
uint _LoopStride;
#define FLAT_ACTIVATION(name, op_name) \
void name##_Flat (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
if (i > O.GetLength()) return;\
\
float v = X.Get(i);\
v = op_name (v);\
O.Set(i, v);\
}
#define LOOP_ACTIVATION(name, op_name) \
void name##_Loop (uint3 dispatchThreadID : SV_DispatchThreadID)\
{\
DISPATCH_ARGS(O.length, 1, 1)\
TENSOR_ARGS2(X, O);\
\
uint i = dispatchThreadID.x;\
uint len = O.GetLength();\
\
while (i < len) {\
float v = X.Get(i); \
v = op_name (v); \
O.Set(i, v); \
i += _LoopStride; \
}\
}
#define ACTIVATION(name, op_name) \
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
FLAT_ACTIVATION(name, op_name)\
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\
LOOP_ACTIVATION(name, op_name)
return 0.5f * (v + abs(v));
return 0.5f * (v + abs(v));
return min(max(0, v), 6);
return min(max(0, v), 6);
return v / (1.f + exp(-v));
return v / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
return 1.f / (1.f + exp(-v));
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
if (v <= 0)
v = _Alpha * (exp(v) - 1);
return v;
return max(v, _Alpha * v);
return max(v, _Alpha * v);
float signed_pow(float f, float e)
float signed_pow(float f)
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
float e = _Alpha;
// handle negative f
float v = pow(abs(f), e);
float s = (e % 2 == 1) ?
sign(f): // exponent is odd => sign(f) * pow(abs(f), e)
1; // exponent is even => pow(abs(f), e)
return v * s;
ACTIVATION(Relu, relu)
ACTIVATION(Relu6, relu6)
ACTIVATION(Tanh, tanh)
ACTIVATION(Sigmoid, sigmoid)
ACTIVATION(Swish, swish)
ACTIVATION(Elu, elu)
ACTIVATION(LeakyRelu, lrelu)
ACTIVATION(Exp, exp)
ACTIVATION(Log, log)
ACTIVATION(Pow, signed_pow)
// -------------------
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint c = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (c >= O.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void Log(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
}
for (uint n = 0; n < X.batch; ++n)
{
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = relu6(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = tanh(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = sigmoid(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = swish(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = elu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = lrelu(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
}
NUMTHREADS((16,16,1), (16,8,1), (16,4,1))
// Elementwise natural log, CNyx dispatch layout:
// thread.x walks channels, thread.y walks the flattened (batch, y, x) index.
// NOTE(review): body previously contained both the CNyx and Nyxc index
// derivations merged together (duplicate declarations of c/nyx/x/ny/y/n),
// which cannot compile; kept the CNyx derivation matching the dispatch args.
void Log_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
    TENSOR_ARGS2(X, O);

    uint c = dispatchThreadID.x;
    uint nyx = dispatchThreadID.y;

    // Decompose the flattened nyx index into (n, y, x).
    uint x = nyx % X.width;
    uint ny = nyx / X.width;
    uint y = ny % X.height;
    uint n = ny / X.height;

    // Guard both dispatch dimensions against over-dispatch.
    if (c >= X.channels) return;
    if (n >= X.batch) return;

    float v = X.Get(n, y, x, c);
    v = log(v);
    O.Set(n, y, x, c, v);
}
NUMTHREADS((512,1,1), (128,1,1), (64,1,1))
void Log_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = exp(v);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = log(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.channels, O.batch * O.height * O.width, 1);
TENSOR_ARGS2(X, O);
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint c = dispatchThreadID.x;
uint nyx = dispatchThreadID.y;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (c >= X.channels) return;
if (n >= X.batch) return;
if (c >= X.channels) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.batch * O.height * O.width * O.channels, 1, 1)
TENSOR_ARGS2(X, O);
uint nyxc = dispatchThreadID.x;
uint nyxc = dispatchThreadID.x;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
uint c = nyxc % X.channels;
uint nyx = nyxc / X.channels;
uint x = nyx % X.width;
uint ny = nyx / X.width;
uint y = ny % X.height;
uint n = ny / X.height;
if (n >= X.batch) return;
if (n >= X.batch) return;
float v = X.Get(n, y, x, c);
v = signed_pow(v, _Alpha);
O.Set(n, y, x, c, v);
float v = X.Get(n, y, x, c);
v = signed_pow(v);
O.Set(n, y, x, c, v);
}

DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
DISPATCH_ARGS(O.flatWidth, O.flatHeight, 1);
TENSOR_ARGS2(X, O);
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
uint x = dispatchThreadID.x;
uint y = dispatchThreadID.y;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
if (x >= O.GetFlatWidth()) return;
if (y >= O.GetFlatHeight()) return;
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float maxV = -FLT_MAX;
for (uint i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
if (v > maxV)
maxV = v;
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float acc = 0.0f;
for (i = 0; i < X.GetFlatWidth(); ++i)
{
float v = X.Get(y, i);
acc += exp(v - maxV);
}
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
float v = X.Get(y, x);
v = exp(v - maxV) / acc;
O.Set(y, x, v);
}

944
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/BarracudaReferenceImpl.compute
File diff is too large to display
View file

68
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Broadcast.compute


NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastAdd(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastSub(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMul(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < O.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastDiv(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

// pow() that tolerates a negative base: mirrors pow(abs(f), e) through the
// sign of f when the exponent is odd, so e.g. signed_pow(-2, 3) == -8.
// NOTE(review): the merged diff duplicated the body and dropped the closing
// brace; restored a single copy. Odd/even test via `e % 2 == 1` only makes
// sense for (near-)integer exponents — matches the original contract.
float signed_pow(float f, float e)
{
    // handle negative f
    float v = pow(abs(f), e);
    float s = (e % 2 == 1) ?
        sign(f):    // exponent is odd  => sign(f) * pow(abs(f), e)
        1;          // exponent is even => pow(abs(f), e)
    return v * s;
}
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMin(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

NUMTHREADS((4,8,8), (4,8,4), (4,4,4))
void BroadcastMax(uint3 dispatchThreadID : SV_DispatchThreadID)
{
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
DISPATCH_ARGS(O.channels, O.width, O.height);
TENSOR_ARGS3(X, B, O);
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z;
if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return;
for (uint n = 0; n < X.batch; ++n)
{

566
UnitySDK/Assets/ML-Agents/Plugins/Barracuda.Core/Barracuda/Resources/Conv.compute


#pragma kernel Conv2D
#pragma kernel Conv2D_RegisterBlock4x2
//#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel Conv2D_L1Cached64_RegisterBlock4x4
#pragma kernel DepthwiseConv2D

NUMTHREADS((16,4,4), (8,4,4), (4,4,4))
// Direct (naive) 2D convolution: one thread per (kernel k, output x, output y),
// looping over batch, kernel window and input channels.
// NOTE(review): body previously contained every statement twice (before/after
// diff hunks merged together) — redeclarations of leftCorner/rightCorner and a
// doubled accumulation loop; de-duplicated to a single coherent implementation.
void Conv2D(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(K.kernelCount, O.width, O.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);

    uint k = dispatchThreadID.x;
    uint x = dispatchThreadID.y;
    uint y = dispatchThreadID.z;

    if (k >= K.channels) return;
    if (x >= O.width) return;
    if (y >= O.height) return;

    // Valid input region expressed in padded coordinates.
    uint2 leftCorner = _Pad.xy;
    uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
    for (uint n = 0; n < O.batch; ++n)
    {
        float acc = B.Get(k);
        for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
        {
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);

                // @TODO: investigate
                // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler
                if (any(pos < leftCorner)) continue;
                if (any(pos >= rightCorner)) continue;

                for (uint c = 0; c < X.channels; ++c)
                    acc = fastfma(X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c), K.Get(dy, dx, c, k), acc);
            }
        }
        O.Set(n, y, x, k, acc);
    }
}

// Register-blocked 2D convolution: each thread computes a SIZE_W x SIZE_H tile
// of output pixels for one kernel k, keeping the tile accumulators in registers.
// NOTE(review): de-duplicated — the merged diff doubled the corner setup, the
// accumulation loops and the final store; a single copy is restored here.
void Conv2D_RegisterBlock4x2(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    DISPATCH_ARGS(K.kernelCount, O.width, O.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);

    uint k = dispatchThreadID.x;
    uint x = dispatchThreadID.y;
    uint y = dispatchThreadID.z;

    if (k >= K.channels) return;
    if (x*SIZE_W >= O.width) return;
    if (y*SIZE_H >= O.height) return;

    uint2 leftCorner = _Pad.xy;
    uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
    for (uint n = 0; n < O.batch; ++n)
    {
        // One accumulator per output pixel of the tile, seeded with the bias.
        float acc[SIZE_H*SIZE_W];
        [unroll]
        for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
            acc[q] = B.Get(k);

        for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
        {
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                // Padded input position for every pixel of the tile.
                uint2 pos[SIZE_H*SIZE_W];
                [unroll]
                for (uint q = 0; q < SIZE_H*SIZE_W; ++q)
                    pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy);

                for (uint c = 0; c < X.channels; ++c)
                    [unroll]
                    for (q = 0; q < SIZE_H*SIZE_W; ++q)
                        if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner))
                            acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]);
            }
        }

        [unroll]
        for (q = 0; q < SIZE_H*SIZE_W; ++q)
            O.Set(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]);
    }
}
#undef SIZE_W
#undef SIZE_H

[numthreads(L1CACHESIZE, 1, 1)]
// L1-cached, register-blocked convolution: a thread group of L1CACHESIZE
// threads cooperatively stages input channels into groupshared memory (X_),
// then each thread accumulates a SIZE x SIZE output tile for its kernel k.
// NOTE(review): the merged diff left BOTH versions of the inner channel loop
// in the body — the old `dc < L1CACHESIZE` and the updated
// `dc < L1CACHESIZE && (c + dc) < K.GetKernelDepth()` (which stops reading
// past the real channel count when channels % L1CACHESIZE != 0). Kept the
// updated, bounds-checked loop and de-duplicated everything else.
void Conv2D_L1Cached64_RegisterBlock4x4(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
    DISPATCH_ARGS(K.kernelCount, O.width, O.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);

    #define X_ Conv2D_L1Cached64_Reg_Loop_safe_X

    uint k = L1CACHESIZE * groupID.x + groupThreadID.x;
    uint x = groupID.y;
    uint y = groupID.z;

    // need all threads to load channels, thus will do late check against kernel count
    if (x*SIZE >= O.width) return;
    if (y*SIZE >= O.height) return;

    for (uint n = 0; n < O.batch; ++n)
    {
        float acc[SIZE*SIZE];
        [unroll]
        for (uint q = 0; q < SIZE*SIZE; ++q)
            acc[q] = B.SafeGet(k);

        for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
        {
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                uint2 pos[SIZE*SIZE];
                [unroll]
                for (uint q = 0; q < SIZE*SIZE; ++q)
                    pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);

                for (uint c = 0; c < X.channels; c += L1CACHESIZE)
                {
                    // Cache X: every thread loads one channel slice per tile pixel.
                    uint dc = groupThreadID.x;
                    [unroll]
                    for (q = 0; q < SIZE*SIZE; ++q)
                        X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);
                    GroupMemoryBarrierWithGroupSync();

                    // X * K
                    if (k < K.channels) // need all threads to load channels, thus late check against kernel count
                    {
                        uint kIndex = K.Index(dy, dx, c, k);
                        for (dc = 0; dc < L1CACHESIZE && (c + dc) < K.GetKernelDepth(); ++dc)
                        {
                            [unroll]
                            for (q = 0; q < SIZE*SIZE; ++q)
                                acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);
                            kIndex += K.channels;
                        }
                    }
                    // Keep the cache coherent before the next channel chunk is staged.
                    GroupMemoryBarrierWithGroupSync();
                }
            }
        }

        // Partial tiles on the right/bottom edge: only store in-range pixels.
        uint remainderW = (O.width - x*SIZE);
        uint remainderH = (O.height - y*SIZE);

        if (k < K.channels) // need all threads to load channels, thus late check against kernel count
            [unroll]
            for (q = 0; q < SIZE*SIZE; ++q)
                if (q/SIZE < remainderH && q%SIZE < remainderW)
                    O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);
    }

    #undef X_
}

DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
DISPATCH_ARGS(K.kernelCount, O.width, O.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
if (k >= K.channels) return;
if (x >= O.width) return;
if (y >= O.height) return;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftCorner = _Pad.xy;
uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy;
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
uint2 leftKernelCorner = uint2(x, y) * _Stride.xy;
uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight());
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner))
{
// path with edge-cases checks
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
acc = fastfma(
X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
O.Set(n, y, x, k, acc);
}
}
else
{
// kernel is guaranteed to be within X,
// no need to check against edge-cases
leftKernelCorner -= leftCorner;
for (uint n = 0; n < O.batch; ++n)
{
float acc = B.Get(k);
for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
{
uint2 pos = leftKernelCorner + uint2(dx, dy);
acc = fastfma(
X.Get(n, pos, k),
K.Get(dy, dx, 0, k),
acc);
}
acc = fastfma(
X.Get(n, pos, k),
K.Get(dy, dx, 0, k),
acc);
}
O.Set(n, y, x, k, acc);
}
}
O.Set(n, y, x, k, acc);
}
}
}

{
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width, X.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
// NOTE: dispatched over X (not O)
DISPATCH_ARGS(K.kernelCount, X.width, X.height);
TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
uint k = dispatchThreadID.x;
uint x = dispatchThreadID.y;
uint y = dispatchThreadID.z;
if (k >= K.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
if (k >= K.channels) return;
if (x >= X.width) return;
if (y >= X.height) return;
uint2 pad = _Pad.xy / _Stride.xy;
uint2 leftCorner = pad;
uint2 rightCorner = uint2(X.width, X.height) + pad;
uint2 pad = _Pad.xy / _Stride.xy;
uint2 leftCorner = pad;
uint2 rightCorner = uint2(X.width, X.height) + pad;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc = B.Get(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy;
for (uint n = 0; n < O.batch; ++n)
{
for (uint sy = 0; sy < _Stride.y; ++sy)
{
for (uint sx = 0; sx < _Stride.x; ++sx)
{
float acc = B.Get(k);
for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y)
{
for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x)
{
uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy;
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
if (any(pos < leftCorner)) continue;
if (any(pos >= rightCorner)) continue;
for (uint c = 0; c < X.channels; ++c)
{
acc = fastfma( X.Get(n, pos - leftCorner, c),
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c, k),
acc);
}
}
}
for (uint c = 0; c < X.channels; ++c)
{
acc = fastfma( X.Get(n, pos - leftCorner, c),
K.Get( K.GetKernelHeight() - 1 - dy,
K.GetKernelWidth() - 1 - dx, c, k),
acc);
}
}
}
uint oy = y * _Stride.y + sy;
uint ox = x * _Stride.x + sx;
if (oy < O.height && ox < O.width)
O.Set(n, oy, ox, k, acc);
}
}
}
uint oy = y * _Stride.y + sy;
uint ox = x * _Stride.x + sx;
if (oy < O.height && ox < O.width)
O.Set(n, oy, ox, k, acc);
}
}
}
}
#undef L1CACHESIZE