Dispatch the SSS CS in 4x swizzled 16x16 groups per 32x32 macrotile

7 年前 · 811c0769
--- a/ScriptableRenderPipeline/Core/ShaderLibrary/SpaceFillingCurves.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/SpaceFillingCurves.hlsl
 // Ref: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 // "Insert" a 0 bit after each of the 16 low bits of x, so that input bit i ends up at
 // bit position 2*i (see the per-step bit layouts in the trailing comments below).
 uint Part1By1(uint x)
 {
-  x &= 0x0000ffff;                  // x = ---- ---- ---- ---- fedc ba98 7654 3210
-  x = (x ^ (x <<  8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
-  x = (x ^ (x <<  4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
-  x = (x ^ (x <<  2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
-  x = (x ^ (x <<  1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
-  return x;
+    x &= 0x0000ffff;                  // x = ---- ---- ---- ---- fedc ba98 7654 3210
+    x = (x ^ (x <<  8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+    x = (x ^ (x <<  4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+    x = (x ^ (x <<  2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
+    x = (x ^ (x <<  1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+    return x;
 }

 // "Insert" two 0 bits after each of the 10 low bits of x.
-  x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
-  x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
-  x = (x ^ (x <<  8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
-  x = (x ^ (x <<  4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
-  x = (x ^ (x <<  2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
-  return x;
+    x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
+    x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+    x = (x ^ (x <<  8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+    x = (x ^ (x <<  4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+    x = (x ^ (x <<  2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+    return x;
 }

 // Inverse of Part1By1 - "delete" all odd-indexed bits.
-  x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
-  x = (x ^ (x >>  1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
-  x = (x ^ (x >>  2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
-  x = (x ^ (x >>  4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
-  x = (x ^ (x >>  8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
-  return x;
+    x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+    x = (x ^ (x >>  1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
+    x = (x ^ (x >>  2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+    x = (x ^ (x >>  4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+    x = (x ^ (x >>  8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
+    return x;
 }

 // Inverse of Part1By2 - "delete" all bits not at positions divisible by 3.
-  x &= 0x09249249;                  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
-  x = (x ^ (x >>  2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
-  x = (x ^ (x >>  4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
-  x = (x ^ (x >>  8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
-  x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
-  return x;
+    x &= 0x09249249;                  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+    x = (x ^ (x >>  2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+    x = (x ^ (x >>  4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+    x = (x ^ (x >>  8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+    x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
+    return x;
-	return (Part1By1(coord.y) << 1) + Part1By1(coord.x);
+    return (Part1By1(coord.y) << 1) + Part1By1(coord.x);
-	return (Part1By2(coord.z) << 2) + (Part1By2(coord.y) << 1) + Part1By2(coord.x);
+    return (Part1By2(coord.z) << 2) + (Part1By2(coord.y) << 1) + Part1By2(coord.x);
-	return uint2(Compact1By1(code >> 0), Compact1By1(code >> 1));
+    return uint2(Compact1By1(code >> 0), Compact1By1(code >> 1));
-	return uint3(Compact1By2(code >> 0), Compact1By2(code >> 1), Compact1By2(code >> 2));
+    return uint3(Compact1By2(code >> 0), Compact1By2(code >> 1), Compact1By2(code >> 2));
+}
+
+// Packs a 2D quad coordinate into a single code: 'quad.x' contributes with weight 1
+// and 'quad.y' with weight 2 (code = x + 2 * y).
+// NOTE(review): presumably both components are expected to be 0 or 1, which would yield
+// a code in [0, 3]; the inputs are not masked here - confirm against callers.
+uint InterleaveQuad(uint2 quad)
+{
+    return quad.x + 2 * quad.y;
+}
+
+// Unpacks a 2D quad coordinate from the two low bits of 'code':
+// bit 0 becomes the X component and bit 1 becomes the Y component.
+// All higher bits of 'code' are masked off, so each component is either 0 or 1.
+uint2 DeinterleaveQuad(uint code)
+{
+    return uint2(code & 1, (code >> 1) & 1);
 }

 #endif // UNITY_SPACE_FILLING_CURVES_INCLUDED
--- a/ScriptableRenderPipeline/HDRenderPipeline/HDRenderPipeline.cs
+++ b/ScriptableRenderPipeline/HDRenderPipeline/HDRenderPipeline.cs
                        cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, HDShaderIDs._CameraFilteringBuffer, m_CameraFilteringBufferRT);

                        // Perform the SSS filtering pass which fills 'm_CameraFilteringBufferRT'.
-                        cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, ((int)hdCamera.screenSize.x + 15) / 16, ((int)hdCamera.screenSize.y + 15) / 16, 1);
+                        // We dispatch 4x swizzled 16x16 groups per a 32x32 macrotile.
+                        cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, 4, ((int)hdCamera.screenSize.x + 31) / 32, ((int)hdCamera.screenSize.y + 31) / 32);

                        cmd.SetGlobalTexture(HDShaderIDs._IrradianceSource, m_CameraFilteringBufferRT);  // Cannot set a RT on a material

                        cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, HDShaderIDs._CameraColorTexture, m_CameraColorBufferRT);

                        // Perform the SSS filtering pass which performs an in-place update of 'm_CameraColorBufferRT'.
-                        cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, ((int)hdCamera.screenSize.x + 15) / 16, ((int)hdCamera.screenSize.y + 15) / 16, 1);
+                        // We dispatch 4x swizzled 16x16 groups per a 32x32 macrotile.
+                        cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, 4, ((int)hdCamera.screenSize.x + 31) / 32, ((int)hdCamera.screenSize.y + 31) / 32);
                    }
                }
                else
--- a/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute
 #pragma kernel SubsurfaceScattering

 [numthreads(GROUP_SIZE_2D, 1, 1)]
-void SubsurfaceScattering(uint2 groupId       : SV_GroupID,
+void SubsurfaceScattering(uint3 groupId       : SV_GroupID,
+    // We dispatch 4x swizzled 16x16 groups per a 32x32 macrotile.
+    // Therefore, we need to unswizzle. TODO: macrotile order.
+    uint2 groupOffset = DeinterleaveQuad(groupId.x);
+    uint2 groupCoord  = uint2(groupId.y * 2 + groupOffset.x, groupId.z * 2 + groupOffset.y);
+
    // Note: any factor of 64 is a suitable wave size for our algorithm.
    uint waveIndex = WaveReadFirstLane(groupThreadId / 64);
    uint laneIndex = groupThreadId % 64;
    uint  mortonCode  = groupThreadId;
    uint2 localCoord  = DecodeMorton2D(mortonCode);
-    uint2 tileAnchor  = groupId * GROUP_SIZE_1D;
-    uint2 pixelCoord  = tileAnchor + localCoord;
-    int2  cacheAnchor = (int2)tileAnchor - TEXTURE_CACHE_BORDER;
+    uint2 groupAnchor = groupCoord * GROUP_SIZE_1D;
+    uint2 pixelCoord  = groupAnchor + localCoord;
+    int2  cacheAnchor = (int2)groupAnchor - TEXTURE_CACHE_BORDER;
    uint2 cacheCoord  = localCoord + TEXTURE_CACHE_BORDER;
    float stencilRef  = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;

-        float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 0)).r;
-        float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 0)).r;
-        float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 1)).r;
-        float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 1)).r;
+        float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(0, 0)).r;
+        float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(1, 0)).r;
+        float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(0, 1)).r;
+        float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(1, 1)).r;

        // Perform the stencil test (reject at the tile rate).
        processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);
    [branch] if (quadIndex < numBorderQuadsPerWave)
    {
        // Fetch another texel into the LDS.
-        uint2 startQuad = halfCacheWidthInQuads * uint2(waveIndex & 1, waveIndex >> 1);
+        uint2 startQuad = halfCacheWidthInQuads * DeinterleaveQuad(waveIndex);

        uint2 quadCoord;

                break;
        }

-        uint2  cacheCoord2 = 2 * (startQuad + quadCoord) + uint2(laneIndex & 1, (laneIndex >> 1) & 1);
-        int2   pixelCoord2 = (int2)(tileAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
+        uint2  cacheCoord2 = 2 * (startQuad + quadCoord) + DeinterleaveQuad(laneIndex);
+        int2   pixelCoord2 = (int2)(groupAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
        float3 irradiance2 = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord2).rgb;
        float  viewZ2      = 0;