// Each #kernel tells which function to compile; you can have many kernels
#pragma kernel PresenseMask
#pragma kernel ClearCounts
#pragma kernel ClearPresenceMask
#pragma kernel CombineCounts

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Packing.hlsl"

// Segmentation image, set from script with cs.SetTexture. These kernels only sample it;
// they never write to it.
Texture2D<float4> SegmentationTexture;
SamplerState sampler_SegmentationTexture;

// Lookup table indexed by instance id; the stored value is used as an index into ClassCounts.
StructuredBuffer<uint> InstanceIdToClassId;
// One entry per instance id: non-zero when that id was found in the segmentation image.
RWStructuredBuffer<uint> InstanceIdPresenceMask;
// One counter per class id, incremented by CombineCounts.
RWStructuredBuffer<uint> ClassCounts;

// Dimensions used to turn a pixel coordinate into a normalized sample coordinate.
uint Width;
uint Height;

// Reads the segmentation texel at pixelCoord and rebuilds the 32-bit id stored in its channels.
// The x, y, z and w channels are packed at bit offsets 0, 8, 16 and 24 with 8 bits each.
uint SampleSegmentationTexture(uint2 pixelCoord)
{
    // Sample at the texel centre, in normalized [0, 1] coordinates, from mip level 0.
    float2 sampleCoord = pixelCoord + float2(.5, .5);
    float2 normalizedCoord = sampleCoord * float2(1 / (float)Width, 1 / (float)Height);
    float4 segmentationSample = SegmentationTexture.SampleLevel(sampler_SegmentationTexture, normalizedCoord, 0);

    uint objectId = PackFloatToUInt(segmentationSample.x, 0, 8)
                  + PackFloatToUInt(segmentationSample.y, 8, 8)
                  + PackFloatToUInt(segmentationSample.z, 16, 8)
                  + PackFloatToUInt(segmentationSample.w, 24, 8);
    return objectId;
}

// For each pixel in the segmentation image, set InstanceIdPresenceMask[pixelValue] to 1.
// A pixel value of 0 is skipped.
[numthreads(8,8,1)]
void PresenseMask (uint3 id : SV_DispatchThreadID)
{
    // Thread groups are 8x8, so when Width or Height is not a multiple of 8 some threads land
    // outside the image. Skip them rather than sampling outside the [0, 1] range.
    if (id.x >= Width || id.y >= Height)
        return;

    uint instanceId = SampleSegmentationTexture(id.xy);
    if (instanceId != 0)
        InstanceIdPresenceMask[instanceId] = 1;

    //Attempt at packing presence into single bits. Good for memory, bad for perf due to InterlockedOr.
    //int bitOffset = instanceId % 32;
    //int maskOffset = instanceId / 32;
    //InterlockedOr(InstanceIdPresenceMask[maskOffset], 1 << bitOffset);
}

// Resets the class counter at index id.x to 0.
[numthreads(8,1,1)]
void ClearCounts (uint3 id : SV_DispatchThreadID)
{
    ClassCounts[id.x] = 0;
}

// Resets the presence flag at index id.x to 0.
[numthreads(8,1,1)]
void ClearPresenceMask(uint3 id : SV_DispatchThreadID)
{
    InstanceIdPresenceMask[id.x] = 0;
}

// For instanceId in InstanceIdPresenceMask, increment the corresponding class count. Uses InterlockedAdd, which may be a significant bottleneck.
// Each thread handles the instance id given by id.x.
[numthreads(8,1,1)]
void CombineCounts (uint3 id : SV_DispatchThreadID)
{
    const uint instanceId = id.x;

    // Nothing to count when this instance id was not flagged as present.
    if (InstanceIdPresenceMask[instanceId] == 0)
        return;

    // Atomic add: several instance ids can map to the same class counter.
    const uint classId = InstanceIdToClassId[instanceId];
    InterlockedAdd(ClassCounts[classId], 1);

    //Attempt at packing presense into single bits. Good for memory, bad for perf due to InterlockedOr in PresenseMask(...).
    //int idStart = id.x * 32;
    //for(int i = 0; i < 32 ; i++)
    //{
    //    bool isPresent = mask & 1 << i;
    //    if (isPresent)
    //    {
    //        InterlockedAdd(ClassCounts[InstanceIdToClassId[idStart + i]], 1);
    //    }
    //}
}