//--------------------------------------------------------------------------------------------------
// Definitions
//--------------------------------------------------------------------------------------------------

// Two kernel variants are compiled from the single 'VolumetricLighting' entry point defined at the bottom of the file.
// Each '#pragma kernel' line remaps the 'VolumetricLighting' identifier to the variant's name and defines
// the light loop macros (LIGHTLOOP_SINGLE_PASS or LIGHTLOOP_TILE_PASS + USE_CLUSTERED_LIGHTLIST) for that variant.
#pragma kernel VolumetricLightingAllLights VolumetricLighting=VolumetricLightingAllLights LIGHTLOOP_SINGLE_PASS
#pragma kernel VolumetricLightingClustered VolumetricLighting=VolumetricLightingClustered LIGHTLOOP_TILE_PASS   USE_CLUSTERED_LIGHTLIST

// NOTE(review): debug symbols are normally a development-time aid - confirm whether this pragma is meant to ship.
#pragma enable_d3d11_debug_symbols

#include "../../../ShaderPass/ShaderPass.cs.hlsl"
// Presumably SHADERPASS_VOLUMETRIC_LIGHTING comes from the header included just above - confirm.
#define SHADERPASS    SHADERPASS_VOLUMETRIC_LIGHTING
// Thread group layout: the kernel is dispatched as a 1D group of GROUP_SIZE_2D threads,
// and consecutive groups are spaced GROUP_SIZE_1D voxels apart along X and Y.
#define GROUP_SIZE_1D 16
#define GROUP_SIZE_2D (GROUP_SIZE_1D * GROUP_SIZE_1D)

//--------------------------------------------------------------------------------------------------
// Included headers
//--------------------------------------------------------------------------------------------------

#include "../../../../Core/ShaderLibrary/Common.hlsl"
#include "../../../../Core/ShaderLibrary/SpaceFillingCurves.hlsl"
#include "../../../../Core/ShaderLibrary/VolumeRendering.hlsl"

#include "../VolumetricLighting.cs.hlsl"
#define UNITY_MATERIAL_LIT // Need to be defined before including Material.hlsl
#include "../../../ShaderVariables.hlsl"
#include "../../../Lighting/Lighting.hlsl" // This includes Material.hlsl

//--------------------------------------------------------------------------------------------------
// Inputs & outputs
//--------------------------------------------------------------------------------------------------

// Output of this pass: FillVolumetricLightingBuffer() stores one texel per (voxel XY, slice).
RW_TEXTURE3D(float4, _VBufferLighting); // RGB = radiance, A = optical depth
// Not referenced by the code visible in this file; presumably reserved for the reprojection TODO - confirm.
TEXTURE3D(_VBufferLightingPrev);        // RGB = radiance, A = optical depth

CBUFFER_START(UnityVolumetricLighting)
    // The kernel multiplies float3(sampleCoord, 1) by the upper-left 3x3 part of this matrix
    // and negates the result to obtain the world-space ray direction scaled s.t. its ViewSpaceZ = 1.
    float4x4 _VBufferCoordToViewDirWS; // Actually just 3x3, but Unity can only set 4x4
CBUFFER_END

//--------------------------------------------------------------------------------------------------
// Implementation
//--------------------------------------------------------------------------------------------------

// A view ray expressed in world space, together with the factor needed
// to convert view-space depths into distances along the ray.
struct Ray
{
    float3 originWS;    // Starting point of the ray (world space)
    float3 directionWS; // Normalized
    float  ratioLenToZ; // Distance along the ray per unit of view-space Z: t = ratioLenToZ * ViewSpaceZ.
                        // The kernel sets it to the length of the direction scaled s.t. its ViewSpaceZ = 1.
};

// Evaluates the ray at parameter 't': the world-space point reached after
// travelling 't' units from the ray origin along its (normalized) direction.
float3 GetPointAtDistance(Ray ray, float t)
{
    float3 displacementWS = ray.directionWS * t;

    return displacementWS + ray.originWS;
}

// Computes the in-scattered radiance along the ray.
// Marches the ray front to back, one V-buffer slice per loop iteration, and for every slice stores
// float4(radiance accumulated so far, optical depth at the middle of the slice) into
// _VBufferLighting[uint3(voxelCoord, slice)].
//   ray        - view ray; 'ratioLenToZ' converts view-space Z into a distance along the ray.
//   voxelCoord - XY coordinate of the voxel column written in _VBufferLighting.
//   tileCoord  - coordinate handed to the light cluster queries (only read with LIGHTLOOP_TILE_PASS).
void FillVolumetricLightingBuffer(Ray ray, uint2 voxelCoord, uint2 tileCoord)
{
    // Only the shadow context is set up; the zero-initialization of the other members is disabled below.
    LightLoopContext context;
    // ZERO_INITIALIZE(LightLoopContext, context);
    context.shadowContext = InitShadowContext();

    // All bits are set, so none of the light category tests below filters anything out.
    uint   featureFlags = 0xFFFFFFFF;       // TODO
    float4 depthParams  = _VBufferDepthEncodingParams;

    float z0 = depthParams.x;               // View space Z coordinate of the near plane
    float t0 = z0 * ray.ratioLenToZ;        // Distance to the near plane
    float de = rcp(VBUFFER_SLICE_COUNT);    // Log-encoded distance between slices

    // Running totals over the slices processed so far.
    float3 totalRadiance = 0;
    float  opticalDepth  = 0;

    // Equals VBUFFER_SLICE_COUNT as long as (uint)z0 does not exceed it; the dependency on a runtime value
    // hides the trip count from the compiler.
    // NOTE(review): a near plane distance larger than the slice count would add loop iterations - confirm this cannot happen.
    uint sliceCountHack = max(VBUFFER_SLICE_COUNT, (uint)z0); // Prevent unrolling...

#ifdef LIGHTLOOP_TILE_PASS
    // Our voxel is not necessarily completely inside a single light cluster.
    // Element [0] describes the cluster at the front of the current slice, element [1] the one at its back.
    uint  clusterIndices[2];
    float clusterDepths[2];
    // TODO: the clustered code could be made faster & simpler.
    clusterIndices[0] = GetLightClusterIndex(tileCoord, z0);
    clusterDepths[0]  = GetLightClusterMinDepthVS(tileCoord, clusterIndices[0]);
#endif // LIGHTLOOP_TILE_PASS

    // TODO: replace 'sliceCountHack' with VBUFFER_SLICE_COUNT when the shader compiler bug is fixed.
    for (uint slice = 0; slice < sliceCountHack; slice++)
    {
        // [t0, t1] is the interval of the ray covered by this slice; 't0' is carried over from the previous iteration.
        float e1 = slice * de + de; // (slice + 1) / sliceCount
        float z1 = DecodeLogarithmicDepth(e1, depthParams);
        float t1 = ray.ratioLenToZ * z1;
        float dt = t1 - t0;

        // Compute the position of the center of the voxel.
        // We will use it for participating media sampling and reprojection.
        // Note: 'centerWS' is not consumed by the code below yet (see the reprojection TODO).
        float  tc       = t0 + 0.5 * dt;
        float3 centerWS = GetPointAtDistance(ray, tc);

        // Sample the participating medium at 'tc' (or 'centerWS').
        // We consider it to be constant along the interval [t0, t1] (within the voxel).
        float3 scattering = _GlobalFog_Scattering;
        float  extinction = _GlobalFog_Extinction;

        // TODO: use a low-discrepancy point set.
        // Until then, every importance sampling routine below receives the same fixed value.
        float rndVal = 0.5;

        // Radiance gathered from all lights for this slice, before applying scattering and the phase function.
        float3 sampleRadiance = 0;

        if (featureFlags & LIGHTFEATUREFLAGS_DIRECTIONAL)
        {
            // A single sample position per slice is shared by all directional lights.
            // Presumably 'weight' combines the transmittance and the inverse PDF of the sample - confirm in VolumeRendering.hlsl.
            float tOffset, weight;
            ImportanceSampleHomogeneousMedium(rndVal, extinction, dt, tOffset, weight);

            float t = t0 + tOffset;
            float3 positionWS = GetPointAtDistance(ray, t);

            for (uint i = 0; i < _DirectionalLightCount; ++i)
            {
                // Fetch the light.
                DirectionalLightData lightData = _DirectionalLightDatas[i];

                float3 L         = -lightData.forward; // Lights point backwards in Unity
                float  intensity = weight;
                float3 color     = lightData.color;

                // A negative index means the light has no shadow.
                [branch] if (lightData.shadowIndex >= 0)
                {
                    float shadow = GetDirectionalShadowAttenuation(context.shadowContext, positionWS,
                                   float3(0, 0, 0), lightData.shadowIndex, L);

                    intensity *= shadow;
                }

                // Note: no fog attenuation along shadow rays for directional lights.

                // A negative index means the light has no cookie.
                [branch] if (lightData.cookieIndex >= 0)
                {
                    float3 lightToSample = positionWS - lightData.positionWS;
                    float4 cookie = EvaluateCookie_Directional(context, lightData, lightToSample);

                    color     *= cookie.rgb;
                    intensity *= cookie.a;
                }

                // Compute the amount of in-scattered radiance.
                sampleRadiance += color * intensity;
            }
        }

    #ifdef LIGHTLOOP_TILE_PASS
        // TODO: the clustered code could be made faster & simpler.
        clusterIndices[1] = GetLightClusterIndex(tileCoord, z1);
        clusterDepths[1]  = GetLightClusterMinDepthVS(tileCoord, clusterIndices[1]);

        // Loop over 1 or 2 light clusters.
        // Only the clusters containing the front and the back of the slice are visited.
        // NOTE(review): clusters lying strictly between those two (if any) are not - confirm this is acceptable.
        for (int cluster = 0; cluster < 2; cluster++)
        {
            // Restrict the sampled interval to the part of the slice covered by the current cluster.
            float tMin = max(t0, clusterDepths[cluster] * ray.ratioLenToZ);
            float tMax = t1;

            if (cluster == 0 && (clusterIndices[0] != clusterIndices[1]))
            {
                // The front cluster ends where the back cluster begins.
                tMax = min(t1, clusterDepths[1] * ray.ratioLenToZ);
            }
    #else
            // Without clustering, the lights are sampled over the entire slice.
            float tMin = t0;
            float tMax = t1;
    #endif // LIGHTLOOP_TILE_PASS

            if (featureFlags & LIGHTFEATUREFLAGS_PUNCTUAL)
            {
                uint punctualLightCount;

            #ifdef LIGHTLOOP_TILE_PASS
                // Clustered variant: iterate over the light list of the current cluster.
                uint punctualLightStart;
                GetCountAndStartCluster(tileCoord, clusterIndices[cluster], LIGHTCATEGORY_PUNCTUAL,
                                        punctualLightStart, punctualLightCount);
            #else
                // Single-pass variant: iterate over all punctual lights.
                punctualLightCount = _PunctualLightCount;
            #endif // LIGHTLOOP_TILE_PASS

                // TODO: since lights are sorted, make a while loop per light type.
                for (uint i = 0; i < punctualLightCount; ++i)
                {
                #ifdef LIGHTLOOP_TILE_PASS
                    uint punctualLightIndex = FetchIndex(punctualLightStart, i);
                #else
                    uint punctualLightIndex = i;
                #endif // LIGHTLOOP_TILE_PASS

                    // Fetch the light.
                    LightData lightData = _LightDatas[punctualLightIndex];
                    int       lightType = lightData.lightType;

                    // TODO...
                    // Only point lights are handled for now; every other light type is skipped.
                    if (lightType != GPULIGHTTYPE_POINT) continue;

                    // Pick the sample distance 't' within [tMin, tMax]; 'rcpPdf' is applied to the intensity further below.
                    float t, rcpPdf;
                    ImportanceSamplePunctualLight(rndVal, lightData.positionWS,
                                                  ray.originWS, ray.directionWS,
                                                  tMin, tMax, t, rcpPdf);

                    float3 positionWS = GetPointAtDistance(ray, t);

                    // TODO: we could compute this data in ImportanceSamplePunctualLight().
                    // NOTE(review): 'distSq' is 0 if the sample coincides with the light position, which makes rsqrt() unbounded
                    // - confirm whether the sampling routine rules this out.
                    float3 lightToSample = positionWS - lightData.positionWS;
                    float  distSq        = dot(lightToSample, lightToSample);
                    float  dist          = sqrt(distSq);
                    float3 L             = lightToSample * -rsqrt(distSq);
                    float  intensity     = GetPunctualShapeAttenuation(lightData, L, distSq);
                    float3 color         = lightData.color;

                    // TODO: heterogeneous medium.
                    // Attenuate the light by the medium between the light and the sample.
                    intensity *= TransmittanceHomogeneousMedium(extinction, dist);

                    [branch] if (lightData.shadowIndex >= 0)
                    {
                        // TODO: make projector lights cast shadows.
                        float shadow = GetPunctualShadowAttenuation(context.shadowContext, positionWS,
                                       float3(0, 0, 0), lightData.shadowIndex, float4(L, dist));

                        // 'shadowDimmer' blends between no shadowing (0) and full shadowing (1).
                        intensity *= lerp(1, shadow, lightData.shadowDimmer);
                    }

                    // Projector lights always have cookies, so we can perform clipping inside the if().
                    [branch] if (lightData.cookieIndex >= 0)
                    {
                        float4 cookie = EvaluateCookie_Punctual(context, lightData, lightToSample);

                        color     *= cookie.rgb;
                        intensity *= cookie.a;
                    }

                    // Compute transmittance from 't0' to 't'.
                    float transmittance = TransmittanceHomogeneousMedium(extinction, t - t0);

                    intensity *= transmittance * rcpPdf;

                    // Compute the amount of in-scattered radiance.
                    sampleRadiance += color * intensity;
                }
            }
    #ifdef LIGHTLOOP_TILE_PASS
            // The voxel is completely inside the light cluster.
            if (clusterIndices[0] == clusterIndices[1]) break;
        }
    #endif // LIGHTLOOP_TILE_PASS

        // Compute the transmittance up to the start of the interval.
        float transmittance = Transmittance(opticalDepth);

        // Integral{a, b}{Transmittance(0, t) * Li(t) dt} = Transmittance(0, a) * Integral{a, b}{Transmittance(0, t - a) * Li(t) dt}.
        totalRadiance += (transmittance * IsotropicPhaseFunction()) * scattering * sampleRadiance;

        // Compute the optical depth up to the center of the interval.
        opticalDepth += 0.5 * extinction * dt;

        // Store the voxel data. TODO: reprojection of 'tc' (or 'centerWS').
        _VBufferLighting[uint3(voxelCoord, slice)] = float4(totalRadiance, opticalDepth);

        // Compute the optical depth up to the end of the interval.
        opticalDepth += 0.5 * extinction * dt;

        // The end of this slice is the start of the next one.
        t0 = t1;

    #ifdef LIGHTLOOP_TILE_PASS
        // Likewise, the back cluster of this slice becomes the front cluster of the next one.
        clusterIndices[0] = clusterIndices[1];
        clusterDepths[0]  = clusterDepths[1];
    #endif // LIGHTLOOP_TILE_PASS
    }
}

// Kernel entry point: each thread builds the view ray of one V-buffer voxel column (XY)
// and fills all of its slices via FillVolumetricLightingBuffer().
// The 'VolumetricLighting' identifier is remapped per variant by the '#pragma kernel' lines at the top of the file.
//   groupId       - 2D index of the thread group; consecutive groups are GROUP_SIZE_1D voxels apart.
//   groupThreadId - flat index of the thread within the group, interpreted as a 2D Morton code.
[numthreads(GROUP_SIZE_2D, 1, 1)]
void VolumetricLighting(uint2 groupId       : SV_GroupID,
                        uint  groupThreadId : SV_GroupThreadID)
{
    // Perform compile-time checks.
    if (!IsPower2(VBUFFER_TILE_SIZE) || !IsPower2(TILE_SIZE_CLUSTERED)) return;

    // Arrange threads in the Morton order to optimally match the memory layout of GCN tiles.
    uint2 localCoord = DecodeMorton2D(groupThreadId);
    uint2 voxelCoord = groupId * GROUP_SIZE_1D + localCoord;
    uint2 tileCoord  = voxelCoord * VBUFFER_TILE_SIZE / TILE_SIZE_CLUSTERED;

    uint voxelsPerClusterTile = Sq(TILE_SIZE_CLUSTERED / VBUFFER_TILE_SIZE);

    // If a cluster tile holds at least 64 voxels, 'tileCoord' is forced to be wave-uniform.
    // Note: any factor of 64 is a suitable wave size for our algorithm.
    // This must happen before the bounds check below, while all the lanes are still active.
    if (voxelsPerClusterTile >= 64)
    {
        // TODO: this is a compile-time test, make sure the compiler actually scalarizes.
        tileCoord = WaveReadFirstLane(tileCoord);
    }

    // Discard the threads which fall outside of the V-buffer.
    [branch] if (voxelCoord.x >= (uint)_VBufferResolutionAndScale.x ||
                 voxelCoord.y >= (uint)_VBufferResolutionAndScale.y)
    {
        return;
    }

    // TODO: use a low-discrepancy point set.
    // Until then, the ray goes through the center of the voxel.
    float2 sampleCoord = voxelCoord + 0.5;

    // Compute the ray direction s.t. its ViewSpaceZ = 1.
    float3 dir = -mul(float3(sampleCoord, 1), (float3x3)_VBufferCoordToViewDirWS);

    // Since ViewSpaceZ of 'dir' is 1, its length is the distance travelled per unit of view-space Z.
    Ray ray;
    ray.originWS    = GetCurrentViewPosition();
    ray.ratioLenToZ = length(dir);
    ray.directionWS = normalize(dir);

    FillVolumetricLightingBuffer(ray, voxelCoord, tileCoord);
}