// =============== Convolves transmitted radiance with the Disney diffusion profile ================

//--------------------------------------------------------------------------------------------------
// Definitions
//--------------------------------------------------------------------------------------------------

// #pragma enable_d3d11_debug_symbols

// Tweak parameters.
// Compile-time switches (0 = off, non-zero = on) selecting optional code paths of the filter below.
#define SSS_BILATERAL_FILTER  1 // If 0, ComputeBilateralWeight() ignores the depth difference between the sample and the center
#define SSS_USE_LDS_CACHE     1 // Cache the tile (plus a border) of {irradiance, linearDepth} texels in group-shared memory
#define SSS_TAA_INTEGRATION   1 // Smoother results at the cost of a tiny amount of flickering in under-sampled areas
#define SSS_ENABLE_NEAR_FIELD 0 // Greatly increases the number of samples. Comes at a high cost.
#define SSS_SAMPLE_TEST_HTILE 0 // Potential optimization. YMMV.
// Note: the kernel currently contains an '#error' which fires if SSS_USE_TANGENT_PLANE is enabled.
#define SSS_USE_TANGENT_PLANE 0 // Improves the accuracy of the approximation (0 -> 1st order). High cost. Does not work with back-facing normals.
#define SSS_CLAMP_ARTIFACT    0 // Reduces bleeding. Use with SSS_USE_TANGENT_PLANE.
#define SSS_DEBUG_LOD         0 // Debug view: outputs a flat color identifying which kernel path each pixel takes
#define SSS_DEBUG_NORMAL_VS   0 // Debug view: outputs white for pixels whose view-space normal is not front-facing

// Do not modify these.
#include "../../ShaderPass/ShaderPass.cs.hlsl"
#define SHADERPASS            SHADERPASS_SUBSURFACE_SCATTERING
#define GROUP_SIZE_1D         16                                          // Width and height (in pixels) of the tile processed by one thread group
#define GROUP_SIZE_2D         (GROUP_SIZE_1D * GROUP_SIZE_1D)             // Number of threads per group: one per pixel of the tile (256)
#define TEXTURE_CACHE_BORDER  2                                           // Number of extra texels cached on each side of the tile
#define TEXTURE_CACHE_SIZE_1D (GROUP_SIZE_1D + 2 * TEXTURE_CACHE_BORDER)  // Width and height (in texels) of the LDS cache (20)

// Check for support of typed UAV loads from FORMAT_R16G16B16A16_FLOAT.
// TODO: query the format support more precisely.
// On all APIs other than PSSL and Xbox One, the result is written to a separate
// '_CameraFilteringTexture' instead of being accumulated into '_CameraColorTexture'
// (the latter requires reading from the UAV; see StoreResult()).
#if !(defined(SHADER_API_PSSL) || defined(SHADER_API_XBOXONE))
#define USE_INTERMEDIATE_BUFFER
#endif

//--------------------------------------------------------------------------------------------------
// Included headers
//--------------------------------------------------------------------------------------------------

#include "ShaderLibrary/Common.hlsl"
#include "ShaderLibrary/Packing.hlsl"
#include "ShaderLibrary/Sampling/Sampling.hlsl"
#include "ShaderLibrary/SpaceFillingCurves.hlsl"
#include "../../ShaderVariables.hlsl"
#include "../../Lighting/LightDefinition.cs.hlsl"
#include "SubsurfaceScattering.hlsl"

//--------------------------------------------------------------------------------------------------
// Inputs & outputs
//--------------------------------------------------------------------------------------------------

// Sample table of the filter, indexed as _FilterKernels[profileID][sampleIndex][component].
// The kernel reads components 0/1 when the near-field kernel is used, and 2/3 otherwise.
float4 _FilterKernels[SSS_N_PROFILES][SSS_N_SAMPLES_NEAR_FIELD]; // XY = near field, ZW = far field; 0 = radius, 1 = reciprocal of the PDF

// The red channel is converted to the linear eye depth via LinearEyeDepth().
TEXTURE2D(_DepthTexture);                           // Z-buffer
// Read as float; 2x2 texels are tested per thread group against STENCILLIGHTINGUSAGE_SPLIT_LIGHTING.
TEXTURE2D(_SSSHTile);                                  // DXGI_FORMAT_R8_UINT is not supported by Unity
// Only the RGB channels are read. TestLightingForSSS() on this value serves as the per-pixel stencil test.
TEXTURE2D(_IrradianceSource);                       // Includes transmitted light

// With the intermediate buffer, the result overwrites the target (alpha = 1);
// otherwise, it is added to the existing contents of the color buffer. See StoreResult().
#ifdef USE_INTERMEDIATE_BUFFER
    RW_TEXTURE2D(float4, _CameraFilteringTexture);  // Target texture
#else
    RW_TEXTURE2D(float4, _CameraColorTexture);      // Target texture
#endif

//--------------------------------------------------------------------------------------------------
// Implementation
//--------------------------------------------------------------------------------------------------

// 6656 bytes used. It appears that the reserved LDS space must be a multiple of 512 bytes.
// (The cache itself takes 20 * 20 texels * 16 bytes = 6400 bytes; presumably, the total is rounded up.)
#if SSS_USE_LDS_CACHE
// Stored in the row-major order; addressed as Mad24(TEXTURE_CACHE_SIZE_1D, y, x).
groupshared float4 textureCache[TEXTURE_CACHE_SIZE_1D * TEXTURE_CACHE_SIZE_1D]; // {irradiance, linearDepth}
#endif
// Result of the tile-rate stencil test. Written by thread 0 of the group, read by all threads after a barrier.
groupshared bool   processGroup;

#if SSS_USE_LDS_CACHE
// Fetches a {irradiance, linearDepth} texel from the LDS cache.
// 'cacheCoord' is the 2D texel coordinate within the cache. It is not range-checked here;
// the caller (LoadSample) is responsible for making sure it lies within the cache.
float4 LoadSampleFromCacheMemory(int2 cacheCoord)
{
    // Same addressing scheme as the one used to populate the cache.
    int texelIndex = Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x);

    return textureCache[texelIndex];
}
#endif

// Fetches a sample directly from the source textures.
// Returns {irradiance, linearDepth} for the texel at 'pixelCoord'.
float4 LoadSampleFromVideoMemory(int2 pixelCoord)
{
    float  deviceDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
    float4 texel;

    texel.rgb = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
    // Store the linear eye depth rather than the raw Z-buffer value.
    texel.a   = LinearEyeDepth(deviceDepth, _ZBufferParams);

    return texel;
}

// Returns {irradiance, linearDepth} for the texel at 'pixelCoord'.
// 'cacheOffset' is the pixel coordinate of the first texel of the LDS cache.
// The texel is served from the LDS cache if it lies within it, and from the video memory otherwise.
float4 LoadSample(int2 pixelCoord, int2 cacheOffset)
{
#if SSS_USE_LDS_CACHE
    int2 localCoord = pixelCoord - cacheOffset;
    // Negative coordinates turn into large unsigned values,
    // so a single comparison covers both ends of the range.
    uint largestCoord = max((uint)localCoord.x, (uint)localCoord.y);

    [branch] if (largestCoord < TEXTURE_CACHE_SIZE_1D)
    {
        return LoadSampleFromCacheMemory(localCoord);
    }
#endif

    // Cache miss (or the cache is disabled).
    // We always fetch both irradiance and depth: this costs extra bandwidth,
    // but avoids dependent texture reads.
    return LoadSampleFromVideoMemory(pixelCoord);
}

// Evaluates the integrand in polar coordinates: f(r, s) = r * R(r, s), where
// f(r, s) = (Exp[-r * s] + Exp[-r * s / 3]) * (s / (8 * Pi)).
// The constant factor (s / (8 * Pi)) is omitted, since the weights are renormalized afterwards.
// 'r' is the radial distance, 'S' is the per-channel shape parameter.
float3 DisneyProfilePolar(float r, float3 S)
{
    // Equivalent to exp((-1/3) * r * S). We fold the base conversion into
    // the constant and use exp2() to help the compiler.
    const float negThirdLog2E = (-1.0 / 3.0) * LOG2_E;
    float3 expThird = exp2((negThirdLog2E * r) * S);

    // Exp[-r * s] is the cube of Exp[-r * s / 3].
    float3 expFull = expThird * expThird * expThird;

    return expThird + expFull;
}

// Returns f(r, s) / p(r, s), where r = sqrt(xy^2 + z^2).
// The PDF does not need to be normalized: rescaling is taken care of by 'totalWeight'.
// xy2:       squared distance within the plane of integration.
// z:         depth difference between the sample and the center (ignored if SSS_BILATERAL_FILTER is 0).
// mmPerUnit: conversion factor to millimeters.
// S:         shape parameter passed to DisneyProfilePolar().
// rcpPdf:    reciprocal of the PDF of the sample.
float3 ComputeBilateralWeight(float xy2, float z, float mmPerUnit, float3 S, float rcpPdf)
{
#if (SSS_BILATERAL_FILTER == 0)
    z = 0;
#endif

#if SSS_USE_TANGENT_PLANE
    // Neither 'xy2' nor 'z' is in millimeters yet: convert the resulting distance.
    float distMm = sqrt(xy2 + z * z) * mmPerUnit;
#else
    // 'xy2' is already in millimeters: convert 'z' alone.
    float zMm    = z * mmPerUnit;
    float distMm = sqrt(xy2 + zMm * zMm);
#endif

    float3 weight = DisneyProfilePolar(distMm, S) * rcpPdf;

#if SSS_CLAMP_ARTIFACT
    weight = saturate(weight);
#endif

    return weight;
}

// Evaluates a single sample of the filter kernel and accumulates its weighted contribution.
// i, n:            index of the sample and the sample count passed to SampleDiskFibonacci().
// profileID:       index of the entry of '_FilterKernels' to use.
// iR, iP:          components of the kernel entry which hold the radius and rcp(pdf), respectively.
// centerCoord:     position of the center of the filter in pixels.
// cacheOffset:     pixel coordinate of the first texel of the LDS cache (see LoadSample()).
// shapeParam:      shape parameter passed to the diffusion profile.
// centerPosVS:     view-space position of the center of the filter.
// mmPerUnit:       conversion factor to millimeters.
// pixelsPerMm:     conversion factor from millimeters to pixels (image plane path only).
// tangentX/Y:      axes of the tangent frame in view space (tangent plane path only).
// projMatrix:      projection matrix (tangent plane path only).
// totalIrradiance: running sum of weighted irradiance.
// totalWeight:     running sum of weights (used for renormalization by the caller).
void EvaluateSample(uint i, uint n, uint profileID, uint iR, uint iP, float2 centerCoord, int2 cacheOffset,
                    float3 shapeParam, float3 centerPosVS, float mmPerUnit, float2 pixelsPerMm,
                    float3 tangentX, float3 tangentY, float4x4 projMatrix,
                    inout float3 totalIrradiance, inout float3 totalWeight)
{
    float  r   = _FilterKernels[profileID][i][iR];
    // The relative sample position is known at the compile time.
    float  phi = SampleDiskFibonacci(i, n).y;
#if (SSS_TAA_INTEGRATION != 0)
    // Rotate the pattern depending on the frame index.
    // Note that we repeat the pattern twice during the TAA cycle to reduce flickering.
    phi += VanDerCorputBase2(_TaaFrameIndex % 4) * TWO_PI;
#endif
    float2 vec = r * float2(cos(phi), sin(phi));

    // Compute the screen-space position and the squared distance (in mm) in the image plane.
    int2 position; float xy2;

    #if SSS_USE_TANGENT_PLANE
        float3 relPosVS    = vec.x * tangentX + vec.y * tangentY;
        float3 positionVS  = centerPosVS + relPosVS;
        // Project the view-space position of the sample onto the screen.
        float2 positionNDC = ComputeNormalizedDeviceCoordinates(positionVS, projMatrix);

        position = (int2)(positionNDC * _ScreenSize.xy);
        xy2      = dot(relPosVS.xy, relPosVS.xy);
    #else
        position = (int2)(centerCoord + vec * pixelsPerMm);
        xy2      = r * r;
    #endif

    float4 textureSample = LoadSample(position, cacheOffset);
    float3 irradiance    = textureSample.rgb;

    // Check the results of the stencil test.
    if (TestLightingForSSS(irradiance))
    {
        // Apply bilateral weighting.
        float  viewZ  = textureSample.a;
        float  relZ   = viewZ - centerPosVS.z;
        float  rcpPdf = _FilterKernels[profileID][i][iP];
        float3 weight = ComputeBilateralWeight(xy2, relZ, mmPerUnit, shapeParam, rcpPdf);

        // Note: if the texture sample is off-screen, (z = 0) -> (viewZ = far) -> (weight ≈ 0).
        totalIrradiance += weight * irradiance;
        totalWeight     += weight;
    }

    // Otherwise, the irradiance is 0. This could happen for 2 reasons.
    // Most likely, the surface fragment does not have an SSS material.
    // Alternatively, our sample comes from a region without any geometry.
    // Our blur is energy-preserving, so such a sample contributes neither
    // irradiance nor weight. The caller does not terminate the loop, since
    // we want to gather the contribution of the remaining samples
    // (e.g. in case of hair covering skin).
}

// Writes the filtered irradiance of the pixel at 'pixelCoord' to the target texture.
// With the intermediate buffer, the value replaces the contents of the texel (alpha is set to 1).
// Otherwise, the value is added to the color already stored in the texel (alpha is left as is).
void StoreResult(uint2 pixelCoord, float3 irradiance)
{
#ifdef USE_INTERMEDIATE_BUFFER
    float4 filteredColor = float4(irradiance, 1);

    _CameraFilteringTexture[pixelCoord] = filteredColor;
#else
    float4 existingColor = _CameraColorTexture[pixelCoord];

    _CameraColorTexture[pixelCoord] = existingColor + float4(irradiance, 0);
#endif
}

#pragma kernel SubsurfaceScattering

// Kernel entry point. Each group of GROUP_SIZE_2D threads filters a tile of
// GROUP_SIZE_1D x GROUP_SIZE_1D pixels of '_IrradianceSource' (one thread per pixel).
// Outline:
//   1. Thread 0 tests 2x2 texels of '_SSSHTile'; the entire group exits if none of them matches.
//   2. Each thread loads its center texel (and up to one border texel) into the LDS cache.
//   3. Threads for which the center texel fails TestLightingForSSS() exit.
//   4. The remaining threads accumulate the weighted samples of the kernel
//      and output the renormalized result using StoreResult().
// reorderedGroupId: X selects one of the 4 groups of a 2x2 block of groups, YZ select the block.
// groupThreadId:    flat index of the thread within the group.
[numthreads(GROUP_SIZE_2D, 1, 1)]
void SubsurfaceScattering(uint3 reorderedGroupId : SV_GroupID,
                          uint  groupThreadId    : SV_GroupThreadID)
{
    // Note: any factor of 64 is a suitable wave size for our algorithm.
    // The group is treated as 4 slices ("waves") of 64 threads; 'waveIndex' is uniform across the wave.
    uint waveIndex = WaveReadFirstLane(groupThreadId / 64);
    uint laneIndex = groupThreadId % 64;
    uint quadIndex = laneIndex / 4;

    // We dispatch 4x swizzled 16x16 groups per 32x32 macrotile.
    // Therefore, we need to reorder. TODO: macrotile order.
    uint2 groupQuad = DeinterleaveQuad(reorderedGroupId.x);
    uint2 groupId   = uint2(reorderedGroupId.y * 2 + groupQuad.x, reorderedGroupId.z * 2 + groupQuad.y);

    // Arrange threads in the Morton order to optimally match the memory layout of GCN tiles.
    uint2 groupCoord  = DecodeMorton2D(groupThreadId);
    uint2 groupOffset = groupId * GROUP_SIZE_1D;
    uint2 pixelCoord  = groupOffset + groupCoord;
    // Pixel coordinate of the first texel of the LDS cache (may be negative along the screen border).
    int2  cacheOffset = (int2)groupOffset - TEXTURE_CACHE_BORDER;

    // A single thread performs the tile-rate test and publishes the result via the LDS.
    [branch] if (groupThreadId == 0)
    {
        float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;

        // Check whether the thread group needs to perform any work.
        // The 2x2 block of '_SSSHTile' texels starting at (2 * groupId) is inspected.
        float s00 = LOAD_TEXTURE2D(_SSSHTile, 2 * groupId + uint2(0, 0)).r;
        float s10 = LOAD_TEXTURE2D(_SSSHTile, 2 * groupId + uint2(1, 0)).r;
        float s01 = LOAD_TEXTURE2D(_SSSHTile, 2 * groupId + uint2(0, 1)).r;
        float s11 = LOAD_TEXTURE2D(_SSSHTile, 2 * groupId + uint2(1, 1)).r;

        // Perform the stencil test (reject at the tile rate).
        processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);
    }

    // Wait for the LDS.
    GroupMemoryBarrierWithGroupSync();

    // 'processGroup' is shared by the entire group, so either all threads exit here, or none of them does.
    [branch] if (!processGroup) { return; }

    float3 centerIrradiance  = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
    float  centerDepth       = 0;
    float  centerViewZ       = 0;
    bool   passedStencilTest = TestLightingForSSS(centerIrradiance);

    // Save some bandwidth by only loading depth values for SSS pixels.
    [branch] if (passedStencilTest)
    {
        centerDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
        centerViewZ = LinearEyeDepth(centerDepth, _ZBufferParams);
    }

#if SSS_USE_LDS_CACHE
    uint2 cacheCoord = groupCoord + TEXTURE_CACHE_BORDER;
    // Populate the central region of the LDS cache.
    // Note that this is done by all threads, including the ones which failed the stencil test.
    textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)] = float4(centerIrradiance, centerViewZ);

    // With the cache size of 20, each wave handles 9 quads (2x2 texels each) of the border,
    // for a total of 4 * 9 * 4 = 144 = (20 * 20 - 16 * 16) border texels.
    uint numBorderQuadsPerWave = TEXTURE_CACHE_SIZE_1D / 2 - 1;
    uint halfCacheWidthInQuads = TEXTURE_CACHE_SIZE_1D / 4;

    // Only the first (4 * numBorderQuadsPerWave) lanes of each wave fetch a border texel.
    [branch] if (quadIndex < numBorderQuadsPerWave)
    {
        // Fetch another texel into the LDS.
        // Each wave is responsible for one quadrant of the cache.
        uint2 startQuad = halfCacheWidthInQuads * DeinterleaveQuad(waveIndex);

        uint2 quadCoord;

        // The traversal order is such that the quad's X coordinate is monotonically increasing.
        // The corner is always near the block of the corresponding wavefront.
        // Note: the compiler can heavily optimize the code below, as the switch is scalar,
        // and there are very few unique values due to the symmetry.
        // Each case walks an L-shaped strip of quads along the two outer edges of the quadrant.
        switch (waveIndex)
        {
            case 0:  // Bottom left
                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                quadCoord.y = max(0, (int)((halfCacheWidthInQuads - 1) - quadIndex));
                break;
            case 1:  // Bottom right
                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
                quadCoord.y = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                break;
            case 2:  // Top left
                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                quadCoord.y = min(quadIndex, halfCacheWidthInQuads - 1);
                break;
            default: // Top right
                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
                quadCoord.y = min(halfCacheWidthInQuads - 1, 2 * (halfCacheWidthInQuads - 1) - quadIndex);
                break;
        }

        // Convert from quads to texels; the lane index selects the texel within the quad.
        uint2  cacheCoord2 = 2 * (startQuad + quadCoord) + DeinterleaveQuad(laneIndex);
        int2   pixelCoord2 = (int2)(groupOffset + cacheCoord2) - TEXTURE_CACHE_BORDER;
        float3 irradiance2 = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord2).rgb;
        float  viewZ2      = 0;

        // Save some bandwidth by only loading depth values for SSS pixels.
        [branch] if (TestLightingForSSS(irradiance2))
        {
            viewZ2 = LinearEyeDepth(LOAD_TEXTURE2D(_DepthTexture, pixelCoord2).r, _ZBufferParams);
        }

        // Populate the border region of the LDS cache.
        textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord2.y, cacheCoord2.x)] = float4(irradiance2, viewZ2);
    }

    // Wait for the LDS.
    GroupMemoryBarrierWithGroupSync();
#endif

    // The per-thread early-out is placed after the barrier, since threads
    // which fail the test still populate the LDS cache above.
    [branch] if (!passedStencilTest) { return; }

    PositionInputs posInput = GetPositionInput(pixelCoord, _ScreenSize.zw);

    // The result of the stencil test allows us to statically determine the material type (SSS).
    SSSData sssData;
    DECODE_FROM_SSSBUFFER(posInput.positionSS, sssData);

    int    profileID   = sssData.subsurfaceProfile;
    float  distScale   = sssData.subsurfaceRadius;
    float3 shapeParam  = _ShapeParams[profileID].rgb;
    float  maxDistance = _ShapeParams[profileID].a;

    // Reconstruct the view-space position corresponding to the central sample.
    // The "corner" is offset from the center by half of '_ScreenSize.zw'.
    float2 centerPosNDC = posInput.positionNDC;
    float2 cornerPosNDC = centerPosNDC + 0.5 * _ScreenSize.zw;
    float3 centerPosVS  = ComputeViewSpacePosition(centerPosNDC, centerDepth, UNITY_MATRIX_I_P);
    float3 cornerPosVS  = ComputeViewSpacePosition(cornerPosNDC, centerDepth, UNITY_MATRIX_I_P);

    // Rescaling the filter is equivalent to inversely scaling the world.
    // Note: 'distScale' may be 0 here; this case is handled by the early-out below.
    float mmPerUnit  = MILLIMETERS_PER_METER * (_WorldScales[profileID].x / distScale);
    float unitsPerMm = rcp(mmPerUnit);

    // Compute the view-space dimensions of the pixel as a quad projected onto geometry.
    // The factor of 2 accounts for the half-sized offset between the center and the corner.
    float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);
    float2 pixelsPerMm   = rcp(unitsPerPixel) * unitsPerMm;

    // We perform point sampling. Therefore, we can avoid the cost
    // of filtering if we stay within the bounds of the current pixel.
    // We use the value of 1 instead of 0.5 as an optimization.
    // N.b.: our LoD selection algorithm is the same regardless of
    // whether we integrate over the tangent plane or not, since we
    // don't want the orientation of the tangent plane to create
    // divergence of execution across the warp.
    float maxDistInPixels = maxDistance * max(pixelsPerMm.x, pixelsPerMm.y);

    float3 albedo = ApplyDiffuseTexturingMode(sssData.diffuseColor, profileID);

    // No filtering: output the irradiance of the center sample modulated by albedo.
    [branch] if (distScale == 0 || maxDistInPixels < 1)
    {
        #if SSS_DEBUG_LOD
            StoreResult(pixelCoord, float3(0, 0, 1));
        #else
            StoreResult(pixelCoord, albedo * centerIrradiance);
        #endif
            return;
    }

    float4x4 viewMatrix, projMatrix;
    GetLeftHandedViewSpaceMatrices(viewMatrix, projMatrix);

    // TODO: Since we have moved to forward SSS, we don't support anymore a bsdfData.normalWS.
    // Once we include normal+roughness rendering during the prepass, we will have a buffer to bind here and we will be able to reuse this part of the algorithm on demand.
#if SSS_USE_TANGENT_PLANE
    #error ThisWillNotCompile_SeeComment
    // Compute the tangent frame in view space.
    float3 normalVS = mul((float3x3)viewMatrix, bsdfData.normalWS);
    float3 tangentX = GetLocalFrame(normalVS)[0] * unitsPerMm;
    float3 tangentY = GetLocalFrame(normalVS)[1] * unitsPerMm;
#else
    // Placeholders: the tangent frame is not used by the image plane path of EvaluateSample().
    float3 normalVS = float3(0, 0, 0);
    float3 tangentX = float3(0, 0, 0);
    float3 tangentY = float3(0, 0, 0);
#endif

#if SSS_DEBUG_NORMAL_VS
    // We expect the normal to be front-facing.
    float3 viewDirVS = normalize(centerPosVS);
    if (dot(normalVS, viewDirVS) >= 0)
    {
        StoreResult(pixelCoord, float3(1, 1, 1));
        return;
    }
#endif

    // Use more samples for SS regions larger than 5x5 pixels (rotated by 45 degrees).
    bool useNearFieldKernel = SSS_ENABLE_NEAR_FIELD && maxDistInPixels > SSS_LOD_THRESHOLD;

#if SSS_DEBUG_LOD
    // Debug view: red = near-field kernel, dark yellow = far-field kernel.
    StoreResult(pixelCoord, useNearFieldKernel ? float3(1, 0, 0) : float3(0.5, 0.5, 0));
    return;
#endif

    // Compute the indices used to access the individual components of the float4 of the kernel.
    uint iR = useNearFieldKernel ? 0 : 2; // radius
    uint iP = useNearFieldKernel ? 1 : 3; // rcp(pdf)

    // Sample 0 is the center sample: it requires no texture fetch and no bilateral weighting.
    float  centerRadius = _FilterKernels[profileID][0][iR];
    float  centerRcpPdf = _FilterKernels[profileID][0][iP];
    float3 centerWeight = DisneyProfilePolar(centerRadius, shapeParam) * centerRcpPdf;

    // Accumulate filtered irradiance and bilateral weights (for renormalization).
    float3 totalIrradiance = centerWeight * centerIrradiance;
    float3 totalWeight     = centerWeight;

    int i, n; // Declare once to avoid the warning from the Unity shader compiler.

    // Samples [1, SSS_N_SAMPLES_FAR_FIELD) are always evaluated.
    [unroll]
    for (i = 1, n = SSS_N_SAMPLES_FAR_FIELD; i < n; i++)
    {
        // Integrate over the image or tangent plane in the view space.
        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheOffset,
                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                       tangentX, tangentY, projMatrix,
                       totalIrradiance, totalWeight);
    }

    [branch] if (!useNearFieldKernel)
    {
         StoreResult(pixelCoord, albedo * totalIrradiance / totalWeight);
         return;
    }

    // Samples [SSS_N_SAMPLES_FAR_FIELD, SSS_N_SAMPLES_NEAR_FIELD) are only evaluated for the near-field kernel.
    [unroll]
    for (i = SSS_N_SAMPLES_FAR_FIELD, n = SSS_N_SAMPLES_NEAR_FIELD; i < n; i++)
    {
        // Integrate over the image or tangent plane in the view space.
        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheOffset,
                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                       tangentX, tangentY, projMatrix,
                       totalIrradiance, totalWeight);
    }

    StoreResult(pixelCoord, albedo * totalIrradiance / totalWeight);
}