Port SSS to compute WIP

8 年前 · f30f49e7
--- a/Assets/ScriptableRenderPipeline/HDRenderPipeline/HDRenderPipeline.cs
+++ b/Assets/ScriptableRenderPipeline/HDRenderPipeline/HDRenderPipeline.cs

        /// <summary>
        /// Records commands into <paramref name="cmd"/> that upload this object's view/projection
        /// matrices (and their inverses), screen-size data and frustum planes to <paramref name="cs"/>,
        /// followed by a set of built-in vectors read back from the global shader state.
        /// </summary>
        /// <param name="cs">Compute shader that receives the parameters.</param>
        /// <param name="cmd">Command buffer the parameter uploads are recorded into.</param>
        public void SetupComputeShader(ComputeShader cs, CommandBuffer cmd)
        {
-            cmd.SetComputeMatrixParam(cs, "_ViewMatrix",         viewMatrix);
-            cmd.SetComputeMatrixParam(cs, "_InvViewMatrix",      viewMatrix.inverse);
-            cmd.SetComputeMatrixParam(cs, "_ProjMatrix",         projMatrix);
-            cmd.SetComputeMatrixParam(cs, "_InvProjMatrix",      projMatrix.inverse);
-            cmd.SetComputeMatrixParam(cs, "_ViewProjMatrix",     viewProjMatrix);
-            cmd.SetComputeMatrixParam(cs, "_InvViewProjMatrix",  viewProjMatrix.inverse);
-            cmd.SetComputeVectorParam(cs, "_InvProjParam",       invProjParam);
-            cmd.SetComputeVectorParam(cs, "_ScreenSize",         screenSize);
-            cmd.SetComputeMatrixParam(cs, "_PrevViewProjMatrix", prevViewProjMatrix);
-            cmd.SetComputeVectorArrayParam(cs, "_FrustumPlanes", frustumPlaneEquations);
+            cmd.SetComputeMatrixParam(cs, "_ViewMatrix",          viewMatrix);
+            cmd.SetComputeMatrixParam(cs, "_InvViewMatrix",       viewMatrix.inverse);
+            cmd.SetComputeMatrixParam(cs, "_ProjMatrix",          projMatrix);
+            cmd.SetComputeMatrixParam(cs, "_InvProjMatrix",       projMatrix.inverse);
+            cmd.SetComputeMatrixParam(cs, "_ViewProjMatrix",      viewProjMatrix);
+            cmd.SetComputeMatrixParam(cs, "_InvViewProjMatrix",   viewProjMatrix.inverse);
+            cmd.SetComputeVectorParam(cs, "_InvProjParam",        invProjParam);
+            cmd.SetComputeVectorParam(cs, "_ScreenSize",          screenSize);
+            cmd.SetComputeMatrixParam(cs, "_PrevViewProjMatrix",  prevViewProjMatrix);
+            cmd.SetComputeVectorArrayParam(cs, "_FrustumPlanes",  frustumPlaneEquations);
+            // Forward built-in values that Unity sets itself (they are not configured in scripts) by reading them back from the global shader state.
+            cmd.SetComputeVectorParam(cs, "unity_OrthoParams",    Shader.GetGlobalVector("unity_OrthoParams"));
+            cmd.SetComputeVectorParam(cs, "_ProjectionParams",    Shader.GetGlobalVector("_ProjectionParams"));
+            cmd.SetComputeVectorParam(cs, "_ScreenParams",        Shader.GetGlobalVector("_ScreenParams"));
+            cmd.SetComputeVectorParam(cs, "_ZBufferParams",       Shader.GetGlobalVector("_ZBufferParams"));
+            cmd.SetComputeVectorParam(cs, "_WorldSpaceCameraPos", Shader.GetGlobalVector("_WorldSpaceCameraPos"));
        }
    }

        Material m_CopyStencilBuffer;

        // Various sets of materials used in the render loop
-        Material m_SssHorizontalFilterAndCombinePass;
+        ComputeShader m_SubsurfaceScatteringCS;
+        int m_SubsurfaceScatteringKernel;
+        Material m_SssHorizontalFilterAndCombinePass;
        // <<< Old SSS Model

        Material m_CameraMotionVectorsMaterial;
        // Old SSS Model >>>
        public void CreateSssMaterials(bool useDisneySSS)
        {
+            m_SubsurfaceScatteringCS     = AssetDatabase.LoadAssetAtPath<ComputeShader>("Assets/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute");
+            m_SubsurfaceScatteringKernel = m_SubsurfaceScatteringCS.FindKernel("SubsurfaceScattering");
+
            Utilities.Destroy(m_SssVerticalFilterPass);
            m_SssVerticalFilterPass = Utilities.CreateEngineMaterial("Hidden/HDRenderPipeline/SubsurfaceScattering");
            Utilities.SelectKeyword(m_SssVerticalFilterPass, "SSS_MODEL_DISNEY", "SSS_MODEL_BASIC", useDisneySSS);
            {
                if (sssSettings.useDisneySSS)
                {
+                    hdCamera.SetupComputeShader(m_SubsurfaceScatteringCS, cmd);
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_GBufferTexture0",    m_gbufferManager.GetGBuffers()[0]);
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_GBufferTexture1",    m_gbufferManager.GetGBuffers()[1]);
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_GBufferTexture2",    m_gbufferManager.GetGBuffers()[2]);
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_GBufferTexture3",    m_gbufferManager.GetGBuffers()[3]);
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_DepthTexture",       GetDepthTexture());
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_StencilTexture",     GetStencilTexture());
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_HTile",              GetHTile());
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_IrradianceSource",   m_CameraDiffuseIrradianceBufferRT);
+                    cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, "_CameraColorTexture", m_CameraColorBufferRT);
+                    cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, ((int)hdCamera.screenSize.x + 15) / 16, ((int)hdCamera.screenSize.y + 15) / 16, 1);
+                    return;
+
                    cmd.SetGlobalTexture("_IrradianceSource", m_CameraDiffuseIrradianceBufferRT); // Cannot set a RT on a material
                    m_SssHorizontalFilterAndCombinePass.SetFloatArray("_WorldScales",            sssParameters.worldScales);
                    m_SssHorizontalFilterAndCombinePass.SetFloatArray("_FilterKernelsNearField", sssParameters.filterKernelsNearField);
--- a/Assets/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/TilePass.cs
+++ b/Assets/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/TilePass.cs
                                cmd.SetComputeVectorParam(shadeOpaqueShader, "_SinTime", Shader.GetGlobalVector("_SinTime"));
                                cmd.SetComputeVectorParam(shadeOpaqueShader, "_CosTime", Shader.GetGlobalVector("_CosTime"));
                                cmd.SetComputeVectorParam(shadeOpaqueShader, "unity_DeltaTime", Shader.GetGlobalVector("unity_DeltaTime"));
-                                cmd.SetComputeVectorParam(shadeOpaqueShader, "_WorldSpaceCameraPos", Shader.GetGlobalVector("_WorldSpaceCameraPos"));
-                                cmd.SetComputeVectorParam(shadeOpaqueShader, "_ProjectionParams", Shader.GetGlobalVector("_ProjectionParams"));
-                                cmd.SetComputeVectorParam(shadeOpaqueShader, "_ScreenParams", Shader.GetGlobalVector("_ScreenParams"));
-                                cmd.SetComputeVectorParam(shadeOpaqueShader, "_ZBufferParams", Shader.GetGlobalVector("_ZBufferParams"));
-                                cmd.SetComputeVectorParam(shadeOpaqueShader, "unity_OrthoParams", Shader.GetGlobalVector("unity_OrthoParams"));
                                cmd.SetComputeIntParam(shadeOpaqueShader, "_EnvLightSkyEnabled", Shader.GetGlobalInt("_EnvLightSkyEnabled"));

                                Texture skyTexture = Shader.GetGlobalTexture("_SkyTexture");
--- a/Assets/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute
+++ b/Assets/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute
+// ===================== Performs integration of the Disney BSSRDF over a disk =====================
+
+//--------------------------------------------------------------------------------------------------
+// Definitions
+//--------------------------------------------------------------------------------------------------
+
+#pragma enable_d3d11_debug_symbols
+
+// Tweak parameters.
+#define SSS_BILATERAL_FILTER  1 // 0 = ignore the depth offset when computing sample weights
+#define SSS_USE_TANGENT_PLANE 0 // non-zero = place samples in the tangent plane instead of screen space
+#define SSS_CLAMP_ARTIFACT    0 // non-zero = saturate() every sample weight
+#define SSS_DEBUG_LOD         0 // non-zero = output a flat color per LoD path instead of the filtered result
+#define SSS_DEBUG_NORMAL_VS   0 // non-zero = paint pixels red where the view-space normal has z >= 0
+
+// Do not modify these.
+#define SSS_PASS              1
+#define MILLIMETERS_PER_METER 1000
+#define CENTIMETERS_PER_METER 100
+#define GROUP_SIZE_1D         16 // width (in pixels) of the square tile covered by one thread group
+#define GROUP_SIZE_2D         (GROUP_SIZE_1D * GROUP_SIZE_1D) // number of threads per group
+#define TEXTURE_CACHE_BORDER  2 // extra texels cached on each side of the tile
+#define TEXTURE_CACHE_SIZE_1D (GROUP_SIZE_1D + 2 * TEXTURE_CACHE_BORDER) // width of the LDS cache in texels
+
+//--------------------------------------------------------------------------------------------------
+// Included headers
+//--------------------------------------------------------------------------------------------------
+
+#include "../../../../ShaderLibrary/Common.hlsl"
+#include "../../../../ShaderLibrary/SpaceFillingCurves.hlsl"
+#include "../../../ShaderConfig.cs.hlsl"
+#include "../../../ShaderVariables.hlsl"
+#define UNITY_MATERIAL_LIT
+#include "../../../Material/Material.hlsl"
+#include "../../../Lighting/LightDefinition.cs.hlsl"
+
+//--------------------------------------------------------------------------------------------------
+// Inputs & outputs
+//--------------------------------------------------------------------------------------------------
+
+float _WorldScales[SSS_N_PROFILES];                                         // Per-profile size of the world unit in meters
+float _FilterKernelsNearField[SSS_N_PROFILES][SSS_N_SAMPLES_NEAR_FIELD][2]; // Per profile and sample: 0 = radius, 1 = reciprocal of the PDF
+float _FilterKernelsFarField[SSS_N_PROFILES][SSS_N_SAMPLES_FAR_FIELD][2];   // Per profile and sample: 0 = radius, 1 = reciprocal of the PDF
+
+DECLARE_GBUFFER_TEXTURE(_GBufferTexture); // Contains the albedo and SSS parameters
+TEXTURE2D(_DepthTexture);                 // Z-buffer (raw values; converted with LinearEyeDepth() where needed)
+TEXTURE2D(_StencilTexture);               // Stencil at the pixel rate, read as a float since DXGI_FORMAT_R8_UINT is not supported by Unity
+TEXTURE2D(_HTile);                        // Stencil at the tile rate (8x8 pixels), read as a float since DXGI_FORMAT_R8_UINT is not supported by Unity
+TEXTURE2D(_IrradianceSource);             // Includes transmitted light
+
+// Contains the HDR color for non-SSS materials.
+// In case of SSS, it only contains the specular lighting, which we additively blend with the SSS lighting.
+RW_TEXTURE2D(float4, _CameraColorTexture);
+
+//--------------------------------------------------------------------------------------------------
+// Implementation
+//--------------------------------------------------------------------------------------------------
+
+// Evaluates the integrand over a disk, (2 * PI * r) * KernelVal(), for the radius 'r'
+// and the shape parameter 'S'.
+// N.b.: the constant factor of 0.25 is left out, so the result is 4 times too large.
+// This is irrelevant, as the weights are renormalized afterwards.
+float3 KernelValCircle(float r, float3 S)
+{
+    float3 e1 = exp(((-1.0 / 3.0) * r) * S); // exp(-S * r / 3)
+    float3 e3 = e1 * e1 * e1;                // exp(-S * r)
+
+    return S * (e1 + e3);
+}
+
+// Returns the bilateral sample weight F(r) / P(r), where r = sqrt(a^2 + b^2).
+// 'a2' is the squared in-plane distance, 'b' is the depth offset,
+// and 'rcpPdf' is the reciprocal of P(r).
+// The PDF is not rescaled here; that is taken care of by 'totalWeight'.
+float3 ComputeBilateralWeight(float a2, float b, float mmPerUnit, float3 S, float rcpPdf)
+{
+#if (SSS_BILATERAL_FILTER == 0)
+    // Bilateral filtering is disabled: drop the depth offset.
+    const float depthOffset = 0;
+#else
+    const float depthOffset = b;
+#endif
+
+#if SSS_USE_TANGENT_PLANE
+    // Both terms need the unit conversion, so convert the combined distance.
+    float r = sqrt(a2 + depthOffset * depthOffset) * mmPerUnit;
+#else
+    // Only the depth offset needs the unit conversion.
+    float depthOffsetMm = depthOffset * mmPerUnit;
+    float r             = sqrt(a2 + depthOffsetMm * depthOffsetMm);
+#endif
+
+    float3 weight = KernelValCircle(r, S) * rcpPdf;
+
+#if SSS_CLAMP_ARTIFACT
+    weight = saturate(weight);
+#endif
+
+    return weight;
+}
+
+// Processes the i-th sample of an n-sample filter kernel: locates the sample, loads its
+// irradiance and, if it is non-zero, accumulates the bilaterally weighted irradiance into
+// 'totalIrradiance' and the weight into 'totalWeight'.
+// 'kernel[profileID][i]' holds the sample's radius ([0]) and the reciprocal of its PDF ([1]).
+// N.b.: besides its arguments, the macro uses 'projMatrix' from the scope of the call site
+// (tangent-plane path only).
+// The depth used for bilateral weighting is loaded from '_DepthTexture', the depth buffer
+// declared by this file and bound to the kernel.
+#define SSS_ITER(i, n, kernel, profileID, shapeParam, centerPosUnSS, centerPosVS,   \
+                 useTangentPlane, tangentX, tangentY, mmPerUnit, pixelsPerMm,       \
+                 totalIrradiance, totalWeight)                                      \
+{                                                                                   \
+    float  r   = kernel[profileID][i][0];                                           \
+    /* The relative sample position is known at compile time. */                    \
+    float  phi = SampleDiskFibonacci(i, n).y;                                       \
+    float2 vec = r * float2(cos(phi), sin(phi));                                    \
+                                                                                    \
+    /* Compute the screen-space position and the associated irradiance. */          \
+    float2 position; float3 irradiance;                                             \
+    /* Compute the squared distance (in mm) in the screen-aligned plane. */         \
+    float dXY2;                                                                     \
+                                                                                    \
+    if (useTangentPlane)                                                            \
+    {                                                                               \
+        /* 'vec' is given relative to the tangent frame. */                         \
+        float3 relPosVS   = vec.x * tangentX + vec.y * tangentY;                    \
+        float3 positionVS = centerPosVS + relPosVS;                                 \
+        float4 positionCS = mul(projMatrix, float4(positionVS, 1));                 \
+        float2 positionSS = ComputeScreenSpacePosition(positionCS);                 \
+                                                                                    \
+        position   = positionSS * _ScreenSize.xy;                                   \
+        irradiance = LOAD_TEXTURE2D(_IrradianceSource, position).rgb;               \
+        dXY2       = dot(relPosVS.xy, relPosVS.xy);                                 \
+    }                                                                               \
+    else                                                                            \
+    {                                                                               \
+        /* 'vec' is given directly in screen-space. */                              \
+        position   = centerPosUnSS + vec * pixelsPerMm;                             \
+        irradiance = LOAD_TEXTURE2D(_IrradianceSource, position).rgb;               \
+        dXY2       = r * r;                                                         \
+    }                                                                               \
+                                                                                    \
+    /* TODO: see if making this a [branch] improves performance. */                 \
+    [flatten]                                                                       \
+    if (any(irradiance))                                                            \
+    {                                                                               \
+        /* Apply bilateral weighting. */                                            \
+        float  z = LOAD_TEXTURE2D(_DepthTexture, position).r;                       \
+        float  d = LinearEyeDepth(z, _ZBufferParams);                               \
+        float  t = d - centerPosVS.z;                                               \
+        float  p = kernel[profileID][i][1];                                         \
+        float3 w = ComputeBilateralWeight(dXY2, t, mmPerUnit, shapeParam, p);       \
+                                                                                    \
+        totalIrradiance += w * irradiance;                                          \
+        totalWeight     += w;                                                       \
+    }                                                                               \
+    else                                                                            \
+    {                                                                               \
+        /*************************************************************************/ \
+        /* The irradiance is 0. This could happen for 3 reasons.                 */ \
+        /* Most likely, the surface fragment does not have an SSS material.      */ \
+        /* Alternatively, our sample comes from a region without any geometry.   */ \
+        /* Finally, the surface fragment could be completely shadowed.           */ \
+        /* Our blur is energy-preserving, so 'centerWeight' should be set to 0.  */ \
+        /* We do not terminate the loop since we want to gather the contribution */ \
+        /* of the remaining samples (e.g. in case of hair covering skin).        */ \
+        /* Note: See comment in the output of deferred.shader                    */ \
+        /*************************************************************************/ \
+    }                                                                               \
+}
+
+#define SSS_LOOP(n, kernel, profileID, shapeParam, centerPosUnSS, centerPosVS,      \
+                 useTangentPlane, tangentX, tangentY, mmPerUnit, pixelsPerMm,       \
+                 totalIrradiance, totalWeight)                                      \
+{                                                                                   \
+    float  centerRadius = kernel[profileID][0][0];                                  \
+    float  centerRcpPdf = kernel[profileID][0][1];                                  \
+    float3 centerWeight = KernelValCircle(centerRadius, shapeParam) * centerRcpPdf; \
+                                                                                    \
+    totalIrradiance = centerWeight * centerIrradiance; /* from the call site */     \
+    totalWeight     = centerWeight;                                                 \
+                                                                                    \
+    /* Integrate over the screen-aligned or tangent plane in the view space. */     \
+    [unroll]                                                                        \
+    for (uint i = 1; i < n; i++) /* sample 0 is the center, handled above */        \
+    {                                                                               \
+        SSS_ITER(i, n, kernel, profileID, shapeParam, centerPosUnSS, centerPosVS,   \
+                 useTangentPlane, tangentX, tangentY, mmPerUnit, pixelsPerMm,       \
+                 totalIrradiance, totalWeight)                                      \
+    }                                                                               \
+}
+
+// Returns true if the stencil value stored for 'pixelCoord' equals 'stencilRef'.
+// The test is first performed at the tile rate (8x8 pixels) using '_HTile', which can only reject.
+// Pixels of the tiles that are not rejected are then tested at the pixel rate using '_StencilTexture'.
+bool StencilTest(int2 pixelCoord, float stencilRef)
+{
+    // Reject at the tile rate.
+    const float tileStencil = LOAD_TEXTURE2D(_HTile, pixelCoord / 8).r;
+    bool        isAccepted  = (tileStencil == stencilRef);
+
+    [branch] if (isAccepted)
+    {
+        // Our copy of HTile cannot be used to accept at the tile rate,
+        // so the final decision is made per pixel.
+        const float pixelStencil = LOAD_TEXTURE2D(_StencilTexture, pixelCoord).r;
+        isAccepted = (pixelStencil == stencilRef);
+    }
+
+    return isAccepted;
+}
+
+#pragma kernel SubsurfaceScattering
+
+// LDS cache of the group's tile plus a border of TEXTURE_CACHE_BORDER texels on each side.
+// NOTE(review): this kernel only populates the cache; the filter samples are still loaded
+// from the textures by SSS_ITER.
+groupshared float4 textureCache[TEXTURE_CACHE_SIZE_1D][TEXTURE_CACHE_SIZE_1D]; // float4(irradiance, linearDepth)
+// Written by thread 0: true if any HTile entry covered by the group passes the stencil test.
+groupshared bool   processGroup;
+
+// Performs the Disney SSS filtering: one thread per pixel, and one thread group per tile
+// of GROUP_SIZE_1D x GROUP_SIZE_1D pixels. For pixels which pass the stencil test,
+// the filtered diffuse lighting is added to '_CameraColorTexture'.
+// N.b.: the wave, lane and quad indices below assume waves of 64 threads.
+[numthreads(GROUP_SIZE_2D, 1, 1)]
+void SubsurfaceScattering(uint2 groupId       : SV_GroupID,
+                          uint  groupThreadId : SV_GroupThreadID)
+{
+    const uint waveIndex = groupThreadId / 64;
+    const uint laneIndex = groupThreadId % 64;
+    const uint quadIndex = laneIndex / 4;
+
+    // Arrange threads in the Morton order to optimally match the memory layout of GCN tiles.
+    const uint  mortonCode = groupThreadId;
+    const uint2 localCoord = DecodeMorton2D(mortonCode);
+    const uint2 tileAnchor = groupId * GROUP_SIZE_1D;
+    const uint2 pixelCoord = tileAnchor + localCoord;
+    const uint2 cacheCoord = localCoord + TEXTURE_CACHE_BORDER;
+    const float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;
+
+    [branch] if (groupThreadId == 0)
+    {
+        // Check whether the thread group needs to perform any work.
+        // The group covers 2x2 entries of HTile.
+        float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 0)).r;
+        float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 0)).r;
+        float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 1)).r;
+        float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 1)).r;
+
+        // Perform the stencil test (reject at the tile rate).
+        processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);
+    }
+
+    // Wait for the LDS.
+    GroupMemoryBarrierWithGroupSync();
+
+    // The value is the same for the entire group, so all of its threads exit together.
+    [branch] if (!processGroup) { return; }
+
+    // Zero-initialized, since they are only assigned for the pixels which pass the stencil test.
+    float3 centerIrradiance = float3(0, 0, 0);
+    float  centerDepth      = 0;
+    float4 cachedValue      = float4(0, 0, 0, 0);
+
+    bool passedStencilTest = StencilTest(pixelCoord, stencilRef);
+
+    [branch] if (passedStencilTest)
+    {
+        centerIrradiance = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
+        centerDepth      = LOAD_TEXTURE2D(_DepthTexture,     pixelCoord).r;
+        cachedValue      = float4(centerIrradiance, LinearEyeDepth(centerDepth, _ZBufferParams));
+    }
+
+    // Populate the central region of the LDS cache.
+    textureCache[cacheCoord.x][cacheCoord.y] = cachedValue;
+
+    // The cache is split into 4 quadrants (one per wave) measured in quads of 2x2 texels.
+    // Each wave fills the border quads of its quadrant, using 4 lanes per quad.
+    const uint numBorderQuadsPerWave = TEXTURE_CACHE_SIZE_1D / 2 - 1;
+    const uint halfCacheWidthInQuads = TEXTURE_CACHE_SIZE_1D / 4;
+
+    [branch] if (quadIndex < numBorderQuadsPerWave)
+    {
+        // Fetch another texel into the LDS.
+        uint2 startQuad = halfCacheWidthInQuads * uint2(waveIndex & 1, waveIndex >> 1);
+
+        // Zero-initialized, so that the value is defined on every path through the switch.
+        uint2 quadCoord = uint2(0, 0);
+
+        // The traversal order is such that the quad's X coordinate is monotonically increasing.
+        // Note: the compiler can heavily optimize the code below, as the switch is scalar,
+        // and there are very few unique values due to the symmetry.
+        switch (waveIndex)
+        {
+            case 0:
+                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
+                quadCoord.y = max(0, (int)((halfCacheWidthInQuads - 1) - quadIndex));
+                break;
+            case 1:
+                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
+                quadCoord.y = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
+                break;
+            case 2:
+                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
+                quadCoord.y = min(quadIndex, halfCacheWidthInQuads - 1);
+                break;
+            case 3:
+                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
+                quadCoord.y = min(halfCacheWidthInQuads - 1, 2 * (halfCacheWidthInQuads - 1) - quadIndex);
+                break;
+        }
+
+        // The two lowest bits of the lane index select the texel within the quad.
+        uint2  cacheCoord2  = (startQuad + quadCoord) * 2 + uint2(laneIndex & 1, (laneIndex >> 1) & 1);
+        int2   pixelCoord2  = (int2)(tileAnchor + cacheCoord2 - TEXTURE_CACHE_BORDER);
+        float4 cachedValue2 = float4(0, 0, 0, 0);
+
+        [branch] if (StencilTest(pixelCoord2, stencilRef))
+        {
+            float3 irradiance2 = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord2).rgb;
+            float  depth2      = LOAD_TEXTURE2D(_DepthTexture,     pixelCoord2).r;
+            cachedValue2       = float4(irradiance2, LinearEyeDepth(depth2, _ZBufferParams));
+        }
+
+        // Populate the border region of the LDS cache.
+        textureCache[cacheCoord2.x][cacheCoord2.y] = cachedValue2;
+    }
+
+    // Wait for the LDS.
+    GroupMemoryBarrierWithGroupSync();
+
+    // There are no barriers past this point, so threads are free to exit individually.
+    [branch] if (!passedStencilTest) { return; }
+
+    PositionInputs posInput = GetPositionInput(pixelCoord, _ScreenSize.zw);
+
+    float3 unused;
+
+    // The result of the stencil test allows us to statically determine the material type (SSS).
+    BSDFData bsdfData;
+    FETCH_GBUFFER(gbuffer, _GBufferTexture, pixelCoord);
+    DECODE_FROM_GBUFFER(gbuffer, MATERIALFEATUREFLAGS_LIT_SSS, bsdfData, unused);
+
+    int    profileID   = bsdfData.subsurfaceProfile;
+    float  distScale   = bsdfData.subsurfaceRadius;
+    float3 shapeParam  = _ShapeParams[profileID].rgb;
+    float  maxDistance = _ShapeParams[profileID].a;
+
+    // Reconstruct the view-space position corresponding to the central sample.
+    float2 centerPosSS = posInput.positionSS;
+    float2 cornerPosSS = centerPosSS + 0.5 * _ScreenSize.zw;
+    float3 centerPosVS = ComputeViewSpacePosition(centerPosSS, centerDepth, _InvProjMatrix);
+    float3 cornerPosVS = ComputeViewSpacePosition(cornerPosSS, centerDepth, _InvProjMatrix);
+
+    // Rescaling the filter is equivalent to inversely scaling the world.
+    float mmPerUnit  = MILLIMETERS_PER_METER * (_WorldScales[profileID] / distScale);
+    float unitsPerMm = rcp(mmPerUnit);
+
+    // Compute the view-space dimensions of the pixel as a quad projected onto geometry.
+    float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);
+    float2 pixelsPerMm   = rcp(unitsPerPixel) * unitsPerMm;
+
+    // We perform point sampling. Therefore, we can avoid the cost
+    // of filtering if we stay within the bounds of the current pixel.
+    // We use the value of 1 instead of 0.5 as an optimization.
+    // N.b.: our LoD selection algorithm is the same regardless of
+    // whether we integrate over the tangent plane or not, since we
+    // don't want the orientation of the tangent plane to create
+    // divergence of execution across the warp.
+    float maxDistInPixels = maxDistance * max(pixelsPerMm.x, pixelsPerMm.y);
+
+    [branch] if (distScale == 0 || maxDistInPixels < 1)
+    {
+        // No filtering: add the unfiltered diffuse lighting of the central sample.
+        #if SSS_DEBUG_LOD
+            _CameraColorTexture[pixelCoord] = float4(0, 0, 1, 1);
+        #else
+            _CameraColorTexture[pixelCoord] += float4(bsdfData.diffuseColor * centerIrradiance, 1);
+        #endif
+        return;
+    }
+
+    const bool useTangentPlane = SSS_USE_TANGENT_PLANE != 0;
+
+    float4x4 viewMatrix, projMatrix;
+    GetLeftHandedViewSpaceMatrices(viewMatrix, projMatrix);
+
+    // Compute the tangent frame in view space.
+    float3 normalVS = mul((float3x3)viewMatrix, bsdfData.normalWS);
+    float3 tangentX = GetLocalFrame(normalVS)[0] * unitsPerMm;
+    float3 tangentY = GetLocalFrame(normalVS)[1] * unitsPerMm;
+
+#if SSS_DEBUG_NORMAL_VS
+    // We expect the view-space normal to be front-facing.
+    if (normalVS.z >= 0) { _CameraColorTexture[pixelCoord] = float4(1, 0, 0, 1); return; }
+#endif
+
+    // Accumulate filtered irradiance and bilateral weights (for renormalization).
+    // N.b.: SSS_LOOP also reads 'centerIrradiance' and 'projMatrix' from this scope.
+    float3 totalIrradiance, totalWeight;
+
+    // Use fewer samples for SS regions smaller than 5x5 pixels (rotated by 45 degrees).
+    [branch] if (maxDistInPixels < SSS_LOD_THRESHOLD)
+    {
+        #if SSS_DEBUG_LOD
+            _CameraColorTexture[pixelCoord] = float4(0.5, 0.5, 0, 1); return;
+        #else
+            SSS_LOOP(SSS_N_SAMPLES_FAR_FIELD, _FilterKernelsFarField,
+                     profileID, shapeParam, pixelCoord, centerPosVS,
+                     useTangentPlane, tangentX, tangentY, mmPerUnit, pixelsPerMm,
+                     totalIrradiance, totalWeight)
+        #endif
+    }
+    else
+    {
+        #if SSS_DEBUG_LOD
+            _CameraColorTexture[pixelCoord] = float4(1, 0, 0, 1); return;
+        #else
+            SSS_LOOP(SSS_N_SAMPLES_NEAR_FIELD, _FilterKernelsNearField,
+                     profileID, shapeParam, pixelCoord, centerPosVS,
+                     useTangentPlane, tangentX, tangentY, mmPerUnit, pixelsPerMm,
+                     totalIrradiance, totalWeight)
+        #endif
+    }
+
+    // Renormalize, apply the albedo, and add the result to the lighting already stored in the color buffer.
+    _CameraColorTexture[pixelCoord] += float4(bsdfData.diffuseColor * totalIrradiance / totalWeight, 1);
+}
+
+/*
+            float4 Frag(Varyings input) : SV_Target
+            {
+                PositionInputs posInput = GetPositionInput(input.positionCS.xy, _ScreenSize.zw);
+
+                float3 unused;
+
+                // Note: When we are in this SubsurfaceScattering shader we know that we are a SSS material. This shader is strongly coupled with the deferred Lit.shader.
+                // We can use the material classification facility to help the compiler to know we use SSS material and optimize the code (and don't require to read gbuffer with materialId).
+                uint featureFlags = MATERIALFEATUREFLAGS_LIT_SSS;
+
+                BSDFData bsdfData;
+                FETCH_GBUFFER(gbuffer, _GBufferTexture, posInput.unPositionSS);
+                DECODE_FROM_GBUFFER(gbuffer, featureFlags, bsdfData, unused);
+
+                int    profileID   = bsdfData.subsurfaceProfile;
+                float  distScale   = bsdfData.subsurfaceRadius;
+            #ifdef SSS_MODEL_DISNEY
+                float3 shapeParam  = _ShapeParams[profileID].rgb;
+                float  maxDistance = _ShapeParams[profileID].a;
+            #else
+                float  maxDistance = _FilterKernelsBasic[profileID][SSS_BASIC_N_SAMPLES - 1].a;
+            #endif
+
+                // Take the first (central) sample.
+                // TODO: copy its neighborhood into LDS.
+                float2 centerPosition   = posInput.unPositionSS;
+                float3 centerIrradiance = LOAD_TEXTURE2D(_IrradianceSource, centerPosition).rgb;
+
+                // Reconstruct the view-space position.
+                float2 centerPosSS = posInput.positionSS;
+                float2 cornerPosSS = centerPosSS + 0.5 * _ScreenSize.zw;
+                float  centerDepth = LOAD_TEXTURE2D(_MainDepthTexture, centerPosition).r;
+                float3 centerPosVS = ComputeViewSpacePosition(centerPosSS, centerDepth, _InvProjMatrix);
+                float3 cornerPosVS = ComputeViewSpacePosition(cornerPosSS, centerDepth, _InvProjMatrix);
+
+            #ifdef SSS_MODEL_DISNEY
+                // Rescaling the filter is equivalent to inversely scaling the world.
+                float mmPerUnit  = MILLIMETERS_PER_METER * (_WorldScales[profileID] / distScale);
+                float unitsPerMm = rcp(mmPerUnit);
+
+                // Compute the view-space dimensions of the pixel as a quad projected onto geometry.
+                float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);
+                float2 pixelsPerMm   = rcp(unitsPerPixel) * unitsPerMm;
+
+                // We perform point sampling. Therefore, we can avoid the cost
+                // of filtering if we stay within the bounds of the current pixel.
+                // We use the value of 1 instead of 0.5 as an optimization.
+                // N.b.: our LoD selection algorithm is the same regardless of
+                // whether we integrate over the tangent plane or not, since we
+                // don't want the orientation of the tangent plane to create
+                // divergence of execution across the warp.
+                float maxDistInPixels = maxDistance * max(pixelsPerMm.x, pixelsPerMm.y);
+
+                [branch]
+                if (distScale == 0 || maxDistInPixels < 1)
+                {
+                    #if SSS_DEBUG_LOD
+                        return float4(0, 0, 1, 1);
+                    #else
+                        return float4(bsdfData.diffuseColor * centerIrradiance, 1);
+                    #endif
+                }
+
+                const bool useTangentPlane = SSS_USE_TANGENT_PLANE != 0;
+
+                float4x4 viewMatrix, projMatrix;
+                GetLeftHandedViewSpaceMatrices(viewMatrix, projMatrix);
+
+                // Compute the tangent frame in view space.
+                float3 normalVS = mul((float3x3)viewMatrix, bsdfData.normalWS);
+                float3 tangentX = GetLocalFrame(normalVS)[0] * unitsPerMm;
+                float3 tangentY = GetLocalFrame(normalVS)[1] * unitsPerMm;
+
+            #if SSS_DEBUG_NORMAL_VS
+                // We expect the view-space normal to be front-facing.
+                if (normalVS.z >= 0) return float4(1, 0, 0, 1);
+            #endif
+
+                // Accumulate filtered irradiance and bilateral weights (for renormalization).
+                float3 totalIrradiance, totalWeight;
+
+                // Use fewer samples for SS regions smaller than 5x5 pixels (rotated by 45 degrees).
+                [branch]
+                if (maxDistInPixels < SSS_LOD_THRESHOLD)
+                {
+                    #if SSS_DEBUG_LOD
+                        return float4(0.5, 0.5, 0, 1);
+                    #else
+                        SSS_LOOP(SSS_N_SAMPLES_FAR_FIELD, _FilterKernelsFarField,
+                                 profileID, shapeParam, centerPosition, centerPosVS,
+                                 useTangentPlane, tangentX, tangentY, mmPerUnit, pixelsPerMm,
+                                 totalIrradiance, totalWeight)
+                    #endif
+                }
+                else
+                {
+                    #if SSS_DEBUG_LOD
+                        return float4(1, 0, 0, 1);
+                    #else
+                        SSS_LOOP(SSS_N_SAMPLES_NEAR_FIELD, _FilterKernelsNearField,
+                                 profileID, shapeParam, centerPosition, centerPosVS,
+                                 useTangentPlane, tangentX, tangentY, mmPerUnit, pixelsPerMm,
+                                 totalIrradiance, totalWeight)
+                    #endif
+                }
+            #else
+                // Rescaling the filter is equivalent to inversely scaling the world.
+                float  metersPerUnit = _WorldScales[profileID] / distScale * SSS_BASIC_DISTANCE_SCALE;
+                float  centimPerUnit = CENTIMETERS_PER_METER * metersPerUnit;
+                // Compute the view-space dimensions of the pixel as a quad projected onto geometry.
+                float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);
+                float2 pixelsPerCm   = rcp(centimPerUnit * unitsPerPixel);
+
+                // Compute the filtering direction.
+            #ifdef SSS_FILTER_HORIZONTAL_AND_COMBINE
+                float2 unitDirection = float2(1, 0);
+            #else
+                float2 unitDirection = float2(0, 1);
+            #endif
+
+                float2   scaledDirection  = pixelsPerCm * unitDirection;
+                float    phi              = 0; // Random rotation; unused for now
+                float2x2 rotationMatrix   = float2x2(cos(phi), -sin(phi), sin(phi), cos(phi));
+                float2   rotatedDirection = mul(rotationMatrix, scaledDirection);
+
+                // Load (1 / (2 * WeightedVariance)) for bilateral weighting.
+            #if RBG_BILATERAL_WEIGHTS
+                float3 halfRcpVariance = _HalfRcpWeightedVariances[profileID].rgb;
+            #else
+                float  halfRcpVariance = _HalfRcpWeightedVariances[profileID].a;
+            #endif
+
+            #ifndef SSS_FILTER_HORIZONTAL_AND_COMBINE
+                bsdfData.diffuseColor = float3(1, 1, 1);
+            #endif
+
+                // Take the first (central) sample.
+                float2 samplePosition   = posInput.unPositionSS;
+                float3 sampleWeight     = _FilterKernelsBasic[profileID][0].rgb;
+                float3 sampleIrradiance = LOAD_TEXTURE2D(_IrradianceSource, samplePosition).rgb;
+
+                // We perform point sampling. Therefore, we can avoid the cost
+                // of filtering if we stay within the bounds of the current pixel.
+                // We use the value of 1 instead of 0.5 as an optimization.
+                float maxDistInPixels = maxDistance * max(pixelsPerCm.x, pixelsPerCm.y);
+
+                [branch]
+                if (distScale == 0 || maxDistInPixels < 1)
+                {
+                    #if SSS_DEBUG_LOD
+                        return float4(0, 0, 1, 1);
+                    #else
+                        return float4(bsdfData.diffuseColor * sampleIrradiance, 1);
+                    #endif
+                }
+                
+                #if SSS_DEBUG_LOD
+                    return float4(0.5, 0.5, 0, 1);
+                #endif
+
+                // Accumulate filtered irradiance and bilateral weights (for renormalization).
+                float3 totalIrradiance = sampleWeight * sampleIrradiance;
+                float3 totalWeight     = sampleWeight;
+
+                [unroll]
+                for (int i = 1; i < SSS_BASIC_N_SAMPLES; i++)
+                {
+                    samplePosition   = posInput.unPositionSS + rotatedDirection * _FilterKernelsBasic[profileID][i].a;
+                    sampleWeight     = _FilterKernelsBasic[profileID][i].rgb;
+                    sampleIrradiance = LOAD_TEXTURE2D(_IrradianceSource, samplePosition).rgb;
+
+                    [flatten]
+                    if (any(sampleIrradiance))
+                    {
+                        // Apply bilateral weighting.
+                        // Ref #1: Skin Rendering by Pseudo–Separable Cross Bilateral Filtering.
+                        // Ref #2: Separable SSS, Supplementary Materials, Section E.
+                        float rawDepth    = LOAD_TEXTURE2D(_MainDepthTexture, samplePosition).r;
+                        float sampleDepth = LinearEyeDepth(rawDepth, _ZBufferParams);
+                        float zDistance   = centimPerUnit * sampleDepth - (centimPerUnit * centerPosVS.z);
+                        sampleWeight     *= exp(-zDistance * zDistance * halfRcpVariance);
+
+                        totalIrradiance += sampleWeight * sampleIrradiance;
+                        totalWeight     += sampleWeight;
+                    }
+                    else
+                    {
+                        // The irradiance is 0. This could happen for 3 reasons.
+                        // Most likely, the surface fragment does not have an SSS material.
+                        // Alternatively, our sample comes from a region without any geometry.
+                        // Finally, the surface fragment could be completely shadowed.
+                        // Our blur is energy-preserving, so 'centerWeight' should be set to 0.
+                        // We do not terminate the loop since we want to gather the contribution
+                        // of the remaining samples (e.g. in case of hair covering skin).
+                    }
+                }
+            #endif
+
+                return float4(bsdfData.diffuseColor * totalIrradiance / totalWeight, 1);
+            }
+            ENDHLSL
+        }
+    }
+    Fallback Off
+}
+
+*/
--- a/Assets/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute.meta
+++ b/Assets/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute.meta
+fileFormatVersion: 2
+guid: b06a7993621def248addd55d0fe931b1
+timeCreated: 1500310187
+licenseType: Pro
+ComputeShaderImporter:
+  externalObjects: {}
+  currentAPIMask: 4
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
--- a/Assets/ScriptableRenderPipeline/ShaderLibrary/SpaceFillingCurves.hlsl
+++ b/Assets/ScriptableRenderPipeline/ShaderLibrary/SpaceFillingCurves.hlsl
+#ifndef UNITY_SPACE_FILLING_SURVES_INCLUDED
+#define UNITY_SPACE_FILLING_SURVES_INCLUDED
+
+// Spreads the 16 low bits of 'x' apart so that input bit i ends up at output
+// bit 2 * i, with a 0 in every odd bit position. The 16 high bits of 'x' are discarded.
+// Ref: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+uint Part1By1(uint x)
+{
+    // Each step moves the upper half of every group apart and masks off the stale copies.
+    uint bits = x & 0x0000ffff;                 // bits = ---- ---- ---- ---- fedc ba98 7654 3210
+    bits = (bits ^ (bits << 8)) & 0x00ff00ff;   // bits = ---- ---- fedc ba98 ---- ---- 7654 3210
+    bits = (bits ^ (bits << 4)) & 0x0f0f0f0f;   // bits = ---- fedc ---- ba98 ---- 7654 ---- 3210
+    bits = (bits ^ (bits << 2)) & 0x33333333;   // bits = --fe --dc --ba --98 --76 --54 --32 --10
+    bits = (bits ^ (bits << 1)) & 0x55555555;   // bits = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+    return bits;
+}
+
+// Spreads the 10 low bits of 'x' apart so that input bit i ends up at output
+// bit 3 * i, with two 0 bits after each of them. The 22 high bits of 'x' are discarded.
+// Ref: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+uint Part1By2(uint x)
+{
+    // Each step moves the upper part of every group apart and masks off the stale copies.
+    uint bits = x & 0x000003ff;                 // bits = ---- ---- ---- ---- ---- --98 7654 3210
+    bits = (bits ^ (bits << 16)) & 0xff0000ff;  // bits = ---- --98 ---- ---- ---- ---- 7654 3210
+    bits = (bits ^ (bits <<  8)) & 0x0300f00f;  // bits = ---- --98 ---- ---- 7654 ---- ---- 3210
+    bits = (bits ^ (bits <<  4)) & 0x030c30c3;  // bits = ---- --98 ---- 76-- --54 ---- 32-- --10
+    bits = (bits ^ (bits <<  2)) & 0x09249249;  // bits = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+    return bits;
+}
+
+// Inverse of Part1By1: drops every odd-indexed bit of 'x' and packs the
+// even-indexed bits together into the 16 low bits of the result.
+// Ref: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+uint Compact1By1(uint x)
+{
+    // Each step pulls neighboring groups together and masks off the stale copies.
+    uint bits = x & 0x55555555;                 // bits = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+    bits = (bits ^ (bits >> 1)) & 0x33333333;   // bits = --fe --dc --ba --98 --76 --54 --32 --10
+    bits = (bits ^ (bits >> 2)) & 0x0f0f0f0f;   // bits = ---- fedc ---- ba98 ---- 7654 ---- 3210
+    bits = (bits ^ (bits >> 4)) & 0x00ff00ff;   // bits = ---- ---- fedc ba98 ---- ---- 7654 3210
+    bits = (bits ^ (bits >> 8)) & 0x0000ffff;   // bits = ---- ---- ---- ---- fedc ba98 7654 3210
+    return bits;
+}
+
+// Inverse of Part1By2: keeps only the bits of 'x' at positions divisible by 3
+// and packs them together into the 10 low bits of the result.
+// Ref: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+uint Compact1By2(uint x)
+{
+    // Each step pulls neighboring groups together and masks off the stale copies.
+    uint bits = x & 0x09249249;                 // bits = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+    bits = (bits ^ (bits >>  2)) & 0x030c30c3;  // bits = ---- --98 ---- 76-- --54 ---- 32-- --10
+    bits = (bits ^ (bits >>  4)) & 0x0300f00f;  // bits = ---- --98 ---- ---- 7654 ---- ---- 3210
+    bits = (bits ^ (bits >>  8)) & 0xff0000ff;  // bits = ---- --98 ---- ---- ---- ---- 7654 3210
+    bits = (bits ^ (bits >> 16)) & 0x000003ff;  // bits = ---- ---- ---- ---- ---- --98 7654 3210
+    return bits;
+}
+
+// Interleaves the bits of a 2D coordinate into a single code:
+// bits of coord.x occupy the even positions, bits of coord.y the odd ones.
+// Part1By1 only keeps the 16 low bits of each component.
+uint EncodeMorton2D(uint2 coord)
+{
+    uint evenBits = Part1By1(coord.x);
+    uint oddBits  = Part1By1(coord.y) << 1;
+    return oddBits + evenBits;
+}
+
+// Interleaves the bits of a 3D coordinate into a single code:
+// bit i of coord.x, coord.y and coord.z goes to bit 3 * i, 3 * i + 1 and 3 * i + 2.
+// Part1By2 only keeps the 10 low bits of each component.
+uint EncodeMorton3D(uint3 coord)
+{
+    uint xBits = Part1By2(coord.x);
+    uint yBits = Part1By2(coord.y) << 1;
+    uint zBits = Part1By2(coord.z) << 2;
+    return zBits + yBits + xBits;
+}
+
+// Splits a 2D interleaved code back into its coordinate:
+// x is gathered from the even bits of 'code', y from the odd bits.
+uint2 DecodeMorton2D(uint code)
+{
+    uint2 coord;
+    coord.x = Compact1By1(code);
+    coord.y = Compact1By1(code >> 1);
+    return coord;
+}
+
+// Splits a 3D interleaved code back into its coordinate:
+// x, y and z are gathered from every third bit of 'code', starting at bit 0, 1 and 2.
+uint3 DecodeMorton3D(uint code)
+{
+    uint3 coord;
+    coord.x = Compact1By2(code);
+    coord.y = Compact1By2(code >> 1);
+    coord.z = Compact1By2(code >> 2);
+    return coord;
+}
+
+#endif // UNITY_SPACE_FILLING_SURVES_INCLUDED
--- a/Assets/ScriptableRenderPipeline/ShaderLibrary/SpaceFillingCurves.hlsl.meta
+++ b/Assets/ScriptableRenderPipeline/ShaderLibrary/SpaceFillingCurves.hlsl.meta
+fileFormatVersion: 2
+guid: 063144fddd2c1be41b9d09dec6314fc7
+timeCreated: 1500391830
+licenseType: Pro
+ShaderImporter:
+  externalObjects: {}
+  defaultTextures: []
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: