ScriptableRenderPipeline/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute


								// =============== Convolves transmitted radiance with the Disney diffusion profile ================


								//--------------------------------------------------------------------------------------------------

								// Definitions

								//--------------------------------------------------------------------------------------------------


								// #pragma enable_d3d11_debug_symbols


								// Tweak parameters (compile-time switches: 1 = on, 0 = off).

								#define SSS_BILATERAL_FILTER  1 // If 0, ComputeBilateralWeight() zeroes the depth difference, so the weight depends on the planar distance only.

								#define SSS_USE_LDS_CACHE     1 // Caches {irradiance, linearDepth} texels of the tile and its border in group-shared memory.

								#define SSS_ENABLE_NEAR_FIELD 0 // Greatly increases the number of samples. Comes at a high cost.

								#define SSS_SAMPLE_TEST_HTILE 0 // Potential optimization. YMMV.

								#define SSS_USE_TANGENT_PLANE 0 // Improves the accuracy of the approximation (0 -> 1st order). High cost. Does not work with back-facing normals. Currently triggers an #error in the kernel.

								#define SSS_CLAMP_ARTIFACT    0 // Reduces bleeding. Use with SSS_USE_TANGENT_PLANE.

								#define SSS_DEBUG_LOD         0 // Debug view: the kernel stores a flat color identifying the chosen filtering path instead of the filtered result.

								#define SSS_DEBUG_NORMAL_VS   0 // Debug view: the kernel stores white for pixels whose view-space normal is not front-facing.


								// Do not modify these.

								#include "../../../ShaderPass/ShaderPass.cs.hlsl"

								#define SHADERPASS            SHADERPASS_SUBSURFACE_SCATTERING

								#define GROUP_SIZE_1D         16 // Side length (in pixels) of the square tile handled by one thread group

								#define GROUP_SIZE_2D         (GROUP_SIZE_1D * GROUP_SIZE_1D) // Number of threads per group (one per pixel of the tile)

								#define TEXTURE_CACHE_BORDER  2 // Width (in texels) of the border cached around the tile

								#define TEXTURE_CACHE_SIZE_1D (GROUP_SIZE_1D + 2 * TEXTURE_CACHE_BORDER) // Side length (in texels) of the cached region: tile + border on both sides


								// Check for support of typed UAV loads from FORMAT_R16G16B16A16_FLOAT.

								// TODO: query the format support more precisely.

								// When USE_INTERMEDIATE_BUFFER is defined (every API except PSSL and Xbox One),

								// StoreResult() overwrites _CameraFilteringTexture instead of accumulating

								// into _CameraColorTexture in place.

								#if !(defined(SHADER_API_PSSL) || defined(SHADER_API_XBOXONE))

								#define USE_INTERMEDIATE_BUFFER

								#endif


								//--------------------------------------------------------------------------------------------------

								// Included headers

								//--------------------------------------------------------------------------------------------------


								#include "ShaderLibrary/Common.hlsl"

								#include "ShaderLibrary/Packing.hlsl"

								#include "ShaderLibrary/Fibonacci.hlsl"

								#include "ShaderLibrary/SpaceFillingCurves.hlsl"

								#include "../../../ShaderVariables.hlsl"

								#include "../../../Lighting/LightDefinition.cs.hlsl"

								#include "../../SubsurfaceScattering/SubsurfaceScattering.hlsl"


								//--------------------------------------------------------------------------------------------------

								// Inputs & outputs

								//--------------------------------------------------------------------------------------------------


								// Per-profile filter kernels, indexed as [profile][sample]. The kernel selects the pair of

								// components to read (XY or ZW) through the 'iR' and 'iP' indices.

								float4 _FilterKernels[SSS_N_PROFILES][SSS_N_SAMPLES_NEAR_FIELD]; // XY = near field, ZW = far field; 0 = radius, 1 = reciprocal of the PDF


								TEXTURE2D(_DepthTexture);                           // Z-buffer

								TEXTURE2D(_HTile);                                  // DXGI_FORMAT_R8_UINT is not supported by Unity; the kernel reads the values as floats

								TEXTURE2D(_IrradianceSource);                       // Includes transmitted light


								// Only one of the two targets is declared; see StoreResult() for how each one is written.

								#ifdef USE_INTERMEDIATE_BUFFER

								    RW_TEXTURE2D(float4, _CameraFilteringTexture);  // Target texture

								#else

								    RW_TEXTURE2D(float4, _CameraColorTexture);      // Target texture

								#endif


								//--------------------------------------------------------------------------------------------------

								// Implementation

								//--------------------------------------------------------------------------------------------------


								// 6656 bytes used. It appears that the reserved LDS space must be a multiple of 512 bytes.

								// The cache itself holds TEXTURE_CACHE_SIZE_1D^2 = 20 * 20 = 400 float4 texels (6400 bytes).

								#if SSS_USE_LDS_CACHE

								groupshared float4 textureCache[TEXTURE_CACHE_SIZE_1D * TEXTURE_CACHE_SIZE_1D]; // {irradiance, linearDepth}

								#endif

								groupshared bool   processGroup; // Result of the tile-rate stencil test: written by thread 0, read by the whole group after a barrier


								#if SSS_USE_LDS_CACHE

								// Fetches one {irradiance, linearDepth} texel from the group-shared cache.
								// 'cacheCoord' is relative to the cache origin; the caller guarantees that
								// both components lie within [0, TEXTURE_CACHE_SIZE_1D).
								float4 LoadSampleFromCacheMemory(int2 cacheCoord)
								{
								    // Row-major addressing: index = width * y + x.
								    int texelIndex = Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x);

								    return textureCache[texelIndex];
								}

								#endif


								// Fetches one texel directly from the source textures.
								// Returns {irradiance, linearDepth}, i.e. the same layout as the LDS cache.
								float4 LoadSampleFromVideoMemory(int2 pixelCoord)
								{
								    float4 texel;

								    texel.rgb = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;

								    // Convert the raw Z-buffer value to the linear eye-space depth.
								    float deviceDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
								    texel.a = LinearEyeDepth(deviceDepth, _ZBufferParams);

								    return texel;
								}


								// Returns {irradiance, linearDepth}.

								// Loads the sample located at 'pixelCoord', preferring the LDS cache (if enabled)
								// whose top-left texel corresponds to the pixel 'cacheAnchor'.
								// Returns {irradiance, linearDepth}.
								float4 LoadSample(int2 pixelCoord, int2 cacheAnchor)
								{
								#if SSS_USE_LDS_CACHE
								    int2 offsetInCache = pixelCoord - cacheAnchor;

								    // Negative offsets wrap around to huge unsigned values, so a single
								    // unsigned comparison checks both the lower and the upper bound.
								    uint largestCoord = max((uint)offsetInCache.x, (uint)offsetInCache.y);

								    [branch] if (largestCoord < TEXTURE_CACHE_SIZE_1D)
								    {
								        return LoadSampleFromCacheMemory(offsetInCache);
								    }
								#endif

								    // Cache miss (or the cache is disabled).
								    // Always load both irradiance and depth.
								    // Avoid dependent texture reads at the cost of extra bandwidth.
								    return LoadSampleFromVideoMemory(pixelCoord);
								}


								// Computes the value of the integrand in polar coordinates: f(r, s) = r * R(r, s).

								// f(r, s) = (Exp[-r * s] + Exp[-r * s / 3]) * (s / (8 * Pi))

								// We can drop the constant (s / (8 * Pi)) due to the subsequent weight renormalization.

								// Evaluates the (unnormalized) integrand of the diffusion profile in polar coordinates
								// for the radius 'r' and the per-channel shape parameter 'S':
								// Exp[-r * S] + Exp[-r * S / 3].
								float3 DisneyProfilePolar(float r, float3 S)
								{
								    // Help the compiler: exp(x) is evaluated as exp2(x * log2(e)),
								    // with the constant factors folded into a single scalar.
								    float  exponentScale = (-1.0 / 3.0) * LOG2_E;
								    float3 expThird      = exp2((exponentScale * r) * S); // Exp[-r * S / 3]

								    // Exp[-r * S] is the cube of Exp[-r * S / 3], which saves a second exponential.
								    float3 expFull = expThird * expThird * expThird;

								    return expThird + expFull;
								}


								// Computes f(r, s)/p(r, s), s.t. r = sqrt(xy^2 + z^2).

								// Rescaling of the PDF is handled by 'totalWeight'.

								// Computes the weight of a sample: the profile value at the distance
								// r = sqrt(xy^2 + z^2) multiplied by the reciprocal of the PDF.
								// xy2       - squared planar distance to the sample.
								// z         - depth difference between the sample and the center.
								// mmPerUnit - conversion factor from view-space units to millimeters.
								// S         - per-channel shape parameter of the profile.
								// rcpPdf    - reciprocal of the sample's PDF.
								// Rescaling of the PDF is handled by 'totalWeight' in the caller.
								float3 ComputeBilateralWeight(float xy2, float z, float mmPerUnit, float3 S, float rcpPdf)
								{
								#if (SSS_BILATERAL_FILTER == 0)
								    // The bilateral term is disabled: ignore the depth difference.
								    z = 0;
								#endif

								#if SSS_USE_TANGENT_PLANE
								    // Both 'xy2' and 'z' require conversion to millimeters.
								    float r = sqrt(xy2 + z * z) * mmPerUnit;
								#else
								    // Only 'z' requires conversion to millimeters.
								    float zInMm = z * mmPerUnit;
								    float r     = sqrt(xy2 + zInMm * zInMm);
								#endif

								    float3 weight = DisneyProfilePolar(r, S) * rcpPdf;

								#if SSS_CLAMP_ARTIFACT
								    weight = saturate(weight);
								#endif

								    return weight;
								}


								// Evaluates the i-th (out of n) sample of the filter kernel and accumulates its contribution.
								// i, n            - index of the sample and the total number of samples of the disk pattern.
								// profileID       - index of the diffusion profile (first index into _FilterKernels).
								// iR, iP          - components of the kernel's float4 holding the radius and the reciprocal of the PDF.
								// centerCoord     - coordinates of the center of the filtered pixel.
								// cacheAnchor     - pixel coordinates of the first texel of the LDS cache (see LoadSample).
								// shapeParam      - per-channel shape parameter of the profile.
								// centerPosVS     - view-space position of the filtered pixel.
								// mmPerUnit       - conversion factor from view-space units to millimeters.
								// pixelsPerMm     - conversion factor from millimeters to pixels (image-plane path only).
								// tangentX/Y      - tangent frame axes (SSS_USE_TANGENT_PLANE path only).
								// projMatrix      - projection matrix (SSS_USE_TANGENT_PLANE path only).
								// totalIrradiance - running sum of the weighted irradiance (updated in place).
								// totalWeight     - running sum of the weights, used for renormalization (updated in place).
								void EvaluateSample(uint i, uint n, uint profileID, uint iR, uint iP, float2 centerCoord, int2 cacheAnchor,
								                    float3 shapeParam, float3 centerPosVS, float mmPerUnit, float2 pixelsPerMm,
								                    float3 tangentX, float3 tangentY, float4x4 projMatrix,
								                    inout float3 totalIrradiance, inout float3 totalWeight)
								{
								    float  r   = _FilterKernels[profileID][i][iR];
								    // The relative sample position is known at the compile time.
								    float  phi = SampleDiskFibonacci(i, n).y;
								    float2 vec = r * float2(cos(phi), sin(phi));

								    // Compute the screen-space position and the squared planar distance to the sample.
								    int2 position; float xy2;

								    #if SSS_USE_TANGENT_PLANE
								        float3 relPosVS    = vec.x * tangentX + vec.y * tangentY;
								        float3 positionVS  = centerPosVS + relPosVS;
								        // Project the view-space position of the sample (the original code referred
								        // to an undeclared 'positionCS' here and left 'positionVS' unused).
								        float2 positionNDC = ComputeNormalizedDeviceCoordinates(positionVS, projMatrix);

								        position = (int2)(positionNDC * _ScreenSize.xy);
								        // View-space units; ComputeBilateralWeight() converts the distance to millimeters.
								        xy2      = dot(relPosVS.xy, relPosVS.xy);
								    #else
								        position = (int2)(centerCoord + vec * pixelsPerMm);
								        // The kernel radius is used directly as the planar distance.
								        xy2      = r * r;
								    #endif

								    float4 textureSample = LoadSample(position, cacheAnchor);
								    float3 irradiance    = textureSample.rgb;

								    // Check the results of the stencil test.
								    if (TestLightingForSSS(irradiance))
								    {
								        // Apply bilateral weighting.
								        float  viewZ  = textureSample.a;
								        float  relZ   = viewZ - centerPosVS.z;
								        float  rcpPdf = _FilterKernels[profileID][i][iP];
								        float3 weight = ComputeBilateralWeight(xy2, relZ, mmPerUnit, shapeParam, rcpPdf);

								        totalIrradiance += weight * irradiance;
								        totalWeight     += weight;
								    }
								    // Otherwise, the irradiance is 0. This could happen for 2 reasons.
								    // Most likely, the surface fragment does not have an SSS material.
								    // Alternatively, our sample comes from a region without any geometry.
								    // Our blur is energy-preserving, so such a sample is given a weight of 0,
								    // i.e. neither accumulator is modified.
								    // The caller does not terminate its loop since we want to gather the
								    // contribution of the remaining samples (e.g. in case of hair covering skin).
								}


								// Writes the filtered irradiance of the pixel 'pixelCoord' to the target texture.
								// With the intermediate buffer, the texel is overwritten (alpha = 1);
								// otherwise, the value is added to the existing color (alpha is left unchanged).
								void StoreResult(uint2 pixelCoord, float3 irradiance)
								{
								#ifdef USE_INTERMEDIATE_BUFFER
								    float4 filteredTexel = float4(irradiance, 1);

								    _CameraFilteringTexture[pixelCoord] = filteredTexel;
								#else
								    float4 contribution = float4(irradiance, 0);

								    _CameraColorTexture[pixelCoord] += contribution;
								#endif
								}


								#pragma kernel SubsurfaceScattering


								// Kernel entry point. One thread group covers a GROUP_SIZE_1D x GROUP_SIZE_1D pixel tile,

								// with one thread per pixel. The kernel:

								//   1. rejects the whole tile unless one of its 4 HTile entries equals the split-lighting stencil value;

								//   2. (SSS_USE_LDS_CACHE) fills the group-shared cache with the tile and its border;

								//   3. for every on-screen pixel which passes the per-pixel test, accumulates the weighted

								//      samples of the filter kernel and stores the renormalized result multiplied by the albedo.

								// groupId       - index of the thread group (tile) along X and Y.

								// groupThreadId - flat index of the thread within its group.

								[numthreads(GROUP_SIZE_2D, 1, 1)]

								void SubsurfaceScattering(uint2 groupId       : SV_GroupID,

								                          uint  groupThreadId : SV_GroupThreadID)

								{

								    // Note: any factor of 64 is a suitable wave size for our algorithm.

								    uint waveIndex = WaveReadFirstLane(groupThreadId / 64);

								    uint laneIndex = groupThreadId % 64;

								    uint quadIndex = laneIndex / 4;


								    // Arrange threads in the Morton order to optimally match the memory layout of GCN tiles.

								    uint  mortonCode  = groupThreadId;

								    uint2 localCoord  = DecodeMorton2D(mortonCode);

								    uint2 tileAnchor  = groupId * GROUP_SIZE_1D;

								    uint2 pixelCoord  = tileAnchor + localCoord;

								    int2  cacheAnchor = (int2)tileAnchor - TEXTURE_CACHE_BORDER;

								    uint2 cacheCoord  = localCoord + TEXTURE_CACHE_BORDER;

								    float stencilRef  = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;


								    // Only the first thread reads HTile; its verdict is shared with the group via 'processGroup'.

								    [branch] if (groupThreadId == 0)

								    {

								        // Check whether the thread group needs to perform any work.

								        // NOTE(review): the 2x2 fetch implies that each HTile entry covers an 8x8 block of the 16x16 tile - confirm.

								        float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 0)).r;

								        float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 0)).r;

								        float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 1)).r;

								        float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 1)).r;


								        // Perform the stencil test (reject at the tile rate).

								        processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);

								    }


								    // Wait for the LDS.

								    GroupMemoryBarrierWithGroupSync();


								    // 'processGroup' is shared by the group, so all of its threads take the same path here.

								    [branch] if (!processGroup) { return; }


								    float3 centerIrradiance  = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;

								    float  centerDepth       = 0;

								    float  centerViewZ       = 0;

								    bool   passedStencilTest = TestLightingForSSS(centerIrradiance);


								    // Save some bandwidth by only loading depth values for SSS pixels.

								    [branch] if (passedStencilTest)

								    {

								        centerDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;

								        centerViewZ = LinearEyeDepth(centerDepth, _ZBufferParams);

								    }


								#if SSS_USE_LDS_CACHE

								    // Populate the central region of the LDS cache.

								    textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)] = float4(centerIrradiance, centerViewZ);


								    // The cache spans 10x10 quads (of 2x2 texels), 8x8 of which belong to the tile itself.

								    // The remaining 36 border quads are split evenly across the 4 waves (9 quads each).

								    uint numBorderQuadsPerWave = TEXTURE_CACHE_SIZE_1D / 2 - 1;

								    uint halfCacheWidthInQuads = TEXTURE_CACHE_SIZE_1D / 4;


								    [branch] if (quadIndex < numBorderQuadsPerWave)

								    {

								        // Fetch another texel into the LDS.

								        // Each wave is responsible for the border quads within its own quarter of the cache.

								        uint2 startQuad = halfCacheWidthInQuads * uint2(waveIndex & 1, waveIndex >> 1);


								        uint2 quadCoord;


								        // The traversal order is such that the quad's X coordinate is monotonically increasing.

								        // The corner is always near the block of the corresponding wavefront.

								        // Note: the compiler can heavily optimize the code below, as the switch is scalar,

								        // and there are very few unique values due to the symmetry.

								        // The unsigned differences below may wrap around; the casts to 'int' turn them

								        // into negative values, which are then clamped to 0 by max().

								        switch (waveIndex)

								        {

								            case 0:  // Bottom left

								                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));

								                quadCoord.y = max(0, (int)((halfCacheWidthInQuads - 1) - quadIndex));

								                break;

								            case 1:  // Bottom right

								                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);

								                quadCoord.y = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));

								                break;

								            case 2:  // Top left

								                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));

								                quadCoord.y = min(quadIndex, halfCacheWidthInQuads - 1);

								                break;

								            default: // Top right

								                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);

								                quadCoord.y = min(halfCacheWidthInQuads - 1, 2 * (halfCacheWidthInQuads - 1) - quadIndex);

								                break;

								        }


								        // The two low bits of the lane index select the texel within the 2x2 quad.

								        uint2  cacheCoord2 = 2 * (startQuad + quadCoord) + uint2(laneIndex & 1, (laneIndex >> 1) & 1);

								        int2   pixelCoord2 = (int2)(tileAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;

								        float3 irradiance2 = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord2).rgb;

								        float  viewZ2      = 0;


								        // Save some bandwidth by only loading depth values for SSS pixels.

								        [branch] if (TestLightingForSSS(irradiance2))

								        {

								            viewZ2 = LinearEyeDepth(LOAD_TEXTURE2D(_DepthTexture, pixelCoord2).r, _ZBufferParams);

								        }


								        // Populate the border region of the LDS cache.

								        textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord2.y, cacheCoord2.x)] = float4(irradiance2, viewZ2);

								    }


								    // Wait for the LDS.

								    GroupMemoryBarrierWithGroupSync();

								#endif


								    bool isOffScreen = pixelCoord.x >= (uint)_ScreenSize.x || pixelCoord.y >= (uint)_ScreenSize.y;


								    // Per-thread early-out. It is located after the last barrier of the kernel.

								    [branch] if (!passedStencilTest || isOffScreen) { return; }


								    PositionInputs posInput = GetPositionInput(pixelCoord, _ScreenSize.zw);


								    // The result of the stencil test allows us to statically determine the material type (SSS).

								    SSSData sssData;

								    DECODE_FROM_SSSBUFFER(posInput.positionSS, sssData);


								    int    profileID   = sssData.subsurfaceProfile;

								    float  distScale   = sssData.subsurfaceRadius;

								    float3 shapeParam  = _ShapeParams[profileID].rgb;

								    float  maxDistance = _ShapeParams[profileID].a;


								    // Reconstruct the view-space position corresponding to the central sample.

								    // The second position is offset by half a pixel and shares the depth of the center.

								    float2 centerPosSS = posInput.positionNDC;

								    float2 cornerPosSS = centerPosSS + 0.5 * _ScreenSize.zw;

								    float3 centerPosVS = ComputeViewSpacePosition(centerPosSS, centerDepth, UNITY_MATRIX_I_P);

								    float3 cornerPosVS = ComputeViewSpacePosition(cornerPosSS, centerDepth, UNITY_MATRIX_I_P);


								    // Rescaling the filter is equivalent to inversely scaling the world.

								    // Note: 'distScale' may be 0 here; that case is only rejected by the branch further below.

								    float mmPerUnit  = MILLIMETERS_PER_METER * (_WorldScales[profileID].x / distScale);

								    float unitsPerMm = rcp(mmPerUnit);


								    // Compute the view-space dimensions of the pixel as a quad projected onto geometry.

								    float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);

								    float2 pixelsPerMm   = rcp(unitsPerPixel) * unitsPerMm;


								    // We perform point sampling. Therefore, we can avoid the cost

								    // of filtering if we stay within the bounds of the current pixel.

								    // We use the value of 1 instead of 0.5 as an optimization.

								    // N.b.: our LoD selection algorithm is the same regardless of

								    // whether we integrate over the tangent plane or not, since we

								    // don't want the orientation of the tangent plane to create

								    // divergence of execution across the warp.

								    float maxDistInPixels = maxDistance * max(pixelsPerMm.x, pixelsPerMm.y);


								    float3 albedo = ApplyDiffuseTexturingMode(sssData.diffuseColor, profileID);


								    // No filtering: output the irradiance of the pixel itself.

								    [branch] if (distScale == 0 || maxDistInPixels < 1)

								    {

								        #if SSS_DEBUG_LOD

								            StoreResult(pixelCoord, float3(0, 0, 1));

								        #else

								            StoreResult(pixelCoord, albedo * centerIrradiance);

								        #endif

								            return;

								    }


								    float4x4 viewMatrix, projMatrix;

								    GetLeftHandedViewSpaceMatrices(viewMatrix, projMatrix);


								    // TODO: Since we have moved to forward SSS, we don't support anymore a bsdfData.normalWS.

								    // Once we include normal+roughness rendering during the prepass, we will have a buffer to bind here and we will be able to reuse this part of the algorithm on demand.

								#if SSS_USE_TANGENT_PLANE

								    #error ThisWillNotCompile_SeeComment

								    // Compute the tangent frame in view space.

								    float3 normalVS = mul((float3x3)viewMatrix, bsdfData.normalWS);

								    float3 tangentX = GetLocalFrame(normalVS)[0] * unitsPerMm;

								    float3 tangentY = GetLocalFrame(normalVS)[1] * unitsPerMm;

								#else

								    // Placeholders: EvaluateSample() does not read the tangent frame on this path.

								    float3 normalVS = float3(0, 0, 0);

								    float3 tangentX = float3(0, 0, 0);

								    float3 tangentY = float3(0, 0, 0);

								#endif


								#if SSS_DEBUG_NORMAL_VS

								    // We expect the normal to be front-facing.

								    float3 viewDirVS = normalize(centerPosVS);

								    if (dot(normalVS, viewDirVS) >= 0)

								    {

								        StoreResult(pixelCoord, float3(1, 1, 1));

								        return;

								    }

								#endif


								    // Use more samples for SS regions larger than 5x5 pixels (rotated by 45 degrees).

								    bool useNearFieldKernel = SSS_ENABLE_NEAR_FIELD && maxDistInPixels > SSS_LOD_THRESHOLD;


								#if SSS_DEBUG_LOD

								    StoreResult(pixelCoord, useNearFieldKernel ? float3(1, 0, 0) : float3(0.5, 0.5, 0));

								    return;

								#endif


								    // Compute the indices used to access the individual components of the float4 of the kernel.

								    uint iR = useNearFieldKernel ? 0 : 2; // radius

								    uint iP = useNearFieldKernel ? 1 : 3; // rcp(pdf)


								    // The sample #0 is the central one: its irradiance is already known, and it needs no bilateral term.

								    float  centerRadius = _FilterKernels[profileID][0][iR];

								    float  centerRcpPdf = _FilterKernels[profileID][0][iP];

								    float3 centerWeight = DisneyProfilePolar(centerRadius, shapeParam) * centerRcpPdf;


								    // Accumulate filtered irradiance and bilateral weights (for renormalization).

								    float3 totalIrradiance = centerWeight * centerIrradiance;

								    float3 totalWeight     = centerWeight;


								    int i, n; // Declare once to avoid the warning from the Unity shader compiler.


								    // First pass: samples [1, SSS_N_SAMPLES_FAR_FIELD).

								    [unroll]

								    for (i = 1, n = SSS_N_SAMPLES_FAR_FIELD; i < n; i++)

								    {

								        // Integrate over the image or tangent plane in the view space.

								        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheAnchor,

								                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,

								                       tangentX, tangentY, projMatrix,

								                       totalIrradiance, totalWeight);

								    }


								    [branch] if (!useNearFieldKernel)

								    {

								         StoreResult(pixelCoord, albedo * totalIrradiance / totalWeight);

								         return;

								    }


								    // Second pass (near-field kernel only): samples [SSS_N_SAMPLES_FAR_FIELD, SSS_N_SAMPLES_NEAR_FIELD).

								    [unroll]

								    for (i = SSS_N_SAMPLES_FAR_FIELD, n = SSS_N_SAMPLES_NEAR_FIELD; i < n; i++)

								    {

								        // Integrate over the image or tangent plane in the view space.

								        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheAnchor,

								                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,

								                       tangentX, tangentY, projMatrix,

								                       totalIrradiance, totalWeight);

								    }


								    StoreResult(pixelCoord, albedo * totalIrradiance / totalWeight);

								}