ScriptableRenderPipeline/Assets/ScriptableRenderLoop/fptl/lightlistbuild.compute


								// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.

								// https://github.com/wolfgangfengel/GPU-Pro-7


								#pragma kernel TileLightListGen


								#include "..\common\ShaderBase.h"

								#include "LightDefinitions.cs.hlsl"


								// Compile-time switches used below: FINE_PRUNING_ENABLED selects the per-pixel pruning path in
								// TileLightListGen, PERFORM_SPHERICAL_INTERSECTION_TESTS enables SphericalIntersectionTests().
								#define FINE_PRUNING_ENABLED

								#define PERFORM_SPHERICAL_INTERSECTION_TESTS


								// Number of lights tested by the coarse pass; also the offset between the min entry [l]
								// and the max entry [l+g_iNrVisibLights] of a light in g_vBoundsBuffer.
								uniform int g_iNrVisibLights;

								// x/y are read as the width/height used to clamp pixel coordinates and to count 16x16 tiles.
								uniform uint2 g_viDimensions;

								// Used by GetLinearDepth() to turn a depth buffer value into a linear depth.
								uniform float4x4 g_mInvScrProjection;

								// Rows 0 and 1 supply the scale/offset terms used by GetViewPosFromLinDepth().
								uniform float4x4 g_mScrProjection;


								// Depth input; sampled through FetchDepth() (declared outside this file).
								Texture2D g_depth_tex : register( t0 );

								// Per-light bounds compared against the tile's normalized xy extent and depth range.
								StructuredBuffer<float3> g_vBoundsBuffer : register( t1 );

								// Per-light data (type, model, position, axes, ...) read by the fine pruning pass.
								StructuredBuffer<SFiniteLightData> g_vLightData : register( t2 );

								// Per-light bounding data (center, radius) read by SphericalIntersectionTests().
								StructuredBuffer<SFiniteLightBound> g_data : register( t3 );


								// Threads per group; each thread handles 4 of the 256 pixels of a 16x16 tile.
								#define NR_THREADS			64


								// output buffer

								//RWBuffer<uint4> g_vLightList : register( u0 );

								RWStructuredBuffer<uint> g_vLightList : register( u0 );


								// Capacity of the in-LDS lists, and the maximum number of lights written out per tile and light model.
								#define MAX_NR_COARSE_ENTRIES		64

								#define MAX_NR_PRUNED_ENTRIES		24


								// Light indices that passed the coarse AABB test (compacted again by the sphere test).
								groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];

								groupshared unsigned int prunedList[MAX_NR_COARSE_ENTRIES];		// temporarily support room for all 64 while in LDS


								// Tile depth range, stored as the bit patterns of floats so integer atomics can be used.
								groupshared uint ldsZMin;

								groupshared uint ldsZMax;

								// Atomic counter of lights appended to coarseList.
								groupshared uint lightOffs;

								#ifdef FINE_PRUNING_ENABLED

								// One bit per coarseList slot (2 x 32 bits); set when any pixel of the tile is lit by that light.
								groupshared uint ldsDoesLightIntersect[2];

								#endif

								// Atomic counter of lights appended to prunedList.
								groupshared int ldsNrLightsFinal;


								// Number of surviving lights per light model.
								groupshared int ldsModelListCount[NR_LIGHT_MODELS];		// since NR_LIGHT_MODELS is 2


								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

								// Atomic counter of lights that survive SphericalIntersectionTests().
								groupshared uint lightOffsSph;

								#endif


								//float GetLinearDepth(float3 vP)

								//{

								//	float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));

								//	return v4Pres.z / v4Pres.w;

								//}


								// Converts a depth buffer value (0 is near, 1 is far) into a linear depth by
								// pushing the point (0, 0, depth, 1) through g_mInvScrProjection and applying
								// the homogeneous divide.
								float GetLinearDepth(float zDptBufSpace)
								{
									const float4 unprojected = mul(g_mInvScrProjection, float4(0.0f, 0.0f, zDptBufSpace, 1.0));
									return unprojected.z / unprojected.w;
								}


								// Reconstructs a position from a screen position and a linear depth.
								// v2ScrPos  - screen position in the units expected by g_mScrProjection.
								// fLinDepth - linear depth, e.g. as returned by GetLinearDepth().
								// The scale and offset terms are read from rows 0 and 1 of g_mScrProjection;
								// the sign convention depends on LEFT_HAND_COORDINATES.
								float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
								{
									const float2 projScale  = float2(g_mScrProjection[0].x, g_mScrProjection[1].y);
									const float2 projOffset = float2(g_mScrProjection[0].z, g_mScrProjection[1].z);

								#ifdef LEFT_HAND_COORDINATES
									const float2 rayXY = (v2ScrPos - projOffset) / projScale;
								#else
									const float2 rayXY = -((v2ScrPos + projOffset) / projScale);
								#endif

									// rayXY is the xy offset at depth one; scale the whole ray by the linear depth.
									return fLinDepth * float3(rayXY, 1.0);
								}


								// Returns the length of the diagonal of a single pixel at depth one, derived
								// from the x and y scale terms of g_mScrProjection.
								float GetOnePixDiagWorldDistAtDepthOne()
								{
									const float2 pixelExtentAtDepthOne = float2(1.0/g_mScrProjection[0].x, 1.0/g_mScrProjection[1].y);
									return length( pixelExtentAtDepthOne );
								}


								// Forward declarations of helpers called by TileLightListGen and defined further down in this file.
								// sortLightList sorts the first n entries of prunedList in place.
								void sortLightList(int localThreadID, int n);


								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

								// Compacts coarseList to the lights whose bounding sphere is hit and returns the new count.
								int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);

								#endif


								// Kernel entry point: one group of NR_THREADS threads builds the light lists of one 16x16 pixel tile.
								//   threadID  - flattened index of the thread inside the group (SV_GroupIndex).
								//   u3GroupID - group id; its xy components are used as the tile coordinate.
								// Stages: (1) tile depth range, (2) coarse list from the bounds in g_vBoundsBuffer,
								// (3) optional bounding-sphere test, (4) optional per-pixel fine pruning,
								// (5) per-light-model counts and sort, (6) packed write to g_vLightList.
								// NOTE(review): the barriers are compiled out on XBONE/PLAYSTATION4, presumably because the
								// whole group runs as a single wave there - confirm before changing NR_THREADS.
								[numthreads(NR_THREADS, 1, 1)]

								void TileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)

								{

									uint2 tileIDX = u3GroupID.xy;

									uint t=threadID;


									// clear the pruned list
									if(t<MAX_NR_COARSE_ENTRIES)

										prunedList[t]=0;


									uint iWidth = g_viDimensions.x;

									uint iHeight = g_viDimensions.y;

									// number of 16x16 tiles, rounded up
									uint nrTilesX = (iWidth+15)/16;

									uint nrTilesY = (iHeight+15)/16;


									// build tile scr boundary

									const uint uFltMax = 0x7f7fffff;  // FLT_MAX as a uint

									// a single thread resets the shared depth range and the coarse list counter
									if(t==0)

									{

										ldsZMin = uFltMax;

										ldsZMax = 0;

										lightOffs = 0;

									}


								#if !defined(XBONE) && !defined(PLAYSTATION4)

									GroupMemoryBarrierWithGroupSync();

								#endif


									// pixel coordinate of the first pixel of this tile
									uint2 viTilLL = 16*tileIDX;


									// establish min and max depth first

									float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;


									// linear depths of the 4 pixels owned by this thread, kept for the fine pruning pass
									float4 vLinDepths;

									{

										// Fetch depths and calculate min/max

										[unroll]

										for(int i = 0; i < 4; i++)

										{

											// idx runs over the 256 pixels of the tile: low 4 bits are x, the remaining bits are y
											int idx = i * NR_THREADS + t;

											// clamp to the last valid pixel so partial tiles at the border read valid data
											uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );

											const float fDepth = FetchDepth(g_depth_tex, uCrd);

											vLinDepths[i] = GetLinearDepth(fDepth);

											if(fDepth<VIEWPORT_SCALE_Z)		// if not skydome

											{

												dpt_mi = min(fDepth, dpt_mi);

												dpt_ma = max(fDepth, dpt_ma);

											}

										}


										// reduce across the group using integer atomics on the float bit patterns
										// (assumes depths are non-negative so the uint order matches the float order)
										InterlockedMax(ldsZMax, asuint(dpt_ma));

										InterlockedMin(ldsZMin, asuint(dpt_mi));


								#if !defined(XBONE) && !defined(PLAYSTATION4)

										GroupMemoryBarrierWithGroupSync();

								#endif

									}


									// tile bounds: xy normalized by the screen dimensions, z is the depth range found above
									float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));

									float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));

									vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy;


									// build coarse list using AABB

									for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)

									{

										// min bounds are stored first, max bounds g_iNrVisibLights entries later
										const float3 vMi = g_vBoundsBuffer[l];

										const float3 vMa = g_vBoundsBuffer[l+g_iNrVisibLights];


										if( all(vMa>vTileLL) && all(vMi<vTileUR))

										{

											unsigned int uInc = 1;

											unsigned int uIndex;

											// the counter keeps growing past the capacity; only the store is guarded
											InterlockedAdd(lightOffs, uInc, uIndex);

											if(uIndex<MAX_NR_COARSE_ENTRIES) coarseList[uIndex] = l;		// add to light list

										}

									}


								#ifdef FINE_PRUNING_ENABLED

									// clear the per-light hit bits before the barrier below
									if(t<2) ldsDoesLightIntersect[t] = 0;

								#endif


								#if !defined(XBONE) && !defined(PLAYSTATION4)

									GroupMemoryBarrierWithGroupSync();

								#endif


									int iNrCoarseLights = lightOffs<MAX_NR_COARSE_ENTRIES ? lightOffs : MAX_NR_COARSE_ENTRIES;


								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

									// second coarse test, performed at the (clamped) center pixel of the tile
									iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );

								#endif


								#ifndef FINE_PRUNING_ENABLED

									// no fine pruning: the coarse list is the final list, truncated to MAX_NR_PRUNED_ENTRIES
									{

										int iNrLightsOut = iNrCoarseLights<MAX_NR_PRUNED_ENTRIES ? iNrCoarseLights : MAX_NR_PRUNED_ENTRIES;

										if((int)t<iNrLightsOut) prunedList[t] = coarseList[t];

										if(t==0) ldsNrLightsFinal=iNrLightsOut;

									}

								#else

									// fine pruning: every thread tests its 4 pixels against every coarse light
									{

										// bit (l&31) of word (l<32 ? 0 : 1) is set when coarse light l lights one of this thread's pixels
										uint uLightsFlags[2] = {0,0};

										int l=0;

										// need this outer loop even on xb1 and ps4 since direct lights and

										// reflection lights are kept in separate regions.

										while(l<iNrCoarseLights)

										{

											// fetch light

											int idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;

											uint uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;


											// each inner loop below consumes a run of consecutive lights of one type

											// spot

											while(l<iNrCoarseLights && uLgtType==SPOT_LIGHT)

											{

												SFiniteLightData lightData = g_vLightData[idxCoarse];

												const bool bIsSpotDisc = (lightData.flags&IS_CIRCULAR_SPOT_SHAPE)!=0;


												// serially check 4 pixels

												uint uVal = 0;

												for(int i=0; i<4; i++)

												{

													int idx = t + i*NR_THREADS;


													uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));

													float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);


													// check pixel

													float3 fromLight = vVPos-lightData.lightPos.xyz;

													float distSq = dot(fromLight,fromLight);

													const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz);		// spotDir = lightData.lightAxisZ.xyz


													float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );


													// distance from the spot axis: euclidean for a circular spot, max-norm otherwise
													float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);

													// lit when inside the radius and inside the cone/pyramid around the spot axis
													if( all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) ) ) uVal = 1;

												}


												uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));

												++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;

												uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;

											}


											// sphere

											while(l<iNrCoarseLights && uLgtType==SPHERE_LIGHT)

											{

												SFiniteLightData lightData = g_vLightData[idxCoarse];


												// serially check 4 pixels

												uint uVal = 0;

												for(int i=0; i<4; i++)

												{

													int idx = t + i*NR_THREADS;


													uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));

													float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);


													// check pixel

													float3 vLp = lightData.lightPos.xyz;

													float3 toLight = vLp - vVPos;

													float distSq = dot(toLight,toLight);


													// lit when the pixel is within the light's radius
													if(lightData.radiusSq>distSq) uVal = 1;

												}


												uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));

												++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;

												uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;

											}


											// Box

											while(l<iNrCoarseLights && uLgtType==BOX_LIGHT)

											{

												SFiniteLightData lightData = g_vLightData[idxCoarse];


												// serially check 4 pixels

												uint uVal = 0;

												for(int i=0; i<4; i++)

												{

													int idx = t + i*NR_THREADS;


													uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));

													float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);


													// check pixel

													float3 toLight  = lightData.lightPos.xyz - vVPos;


													// offset expressed along the three light axes
													float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );

													dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange;		// not as efficient as it could be

													if( max(max(dist.x, dist.y), dist.z)<1 ) uVal = 1;						// but allows us to not write out OuterDists

												}


												uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));

												++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;

												uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;

											}


											// in case we have some corrupt data make sure we terminate

											if(uLgtType>=MAX_TYPES) ++l;

										}


										// merge the per-thread hit bits of the whole group
										InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);

										InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);

										if(t==0) ldsNrLightsFinal = 0;


								#if !defined(XBONE) && !defined(PLAYSTATION4)

										GroupMemoryBarrierWithGroupSync();

								#endif


										// thread t compacts coarse light t into prunedList when any pixel of the tile was lit by it
										if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 )

										{

											unsigned int uInc = 1;

											unsigned int uIndex;

											InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);

											if(uIndex<MAX_NR_COARSE_ENTRIES) prunedList[uIndex] = coarseList[t];		// we allow up to 64 pruned lights while stored in LDS.

										}

									}

								#endif


									//

									// reset the per-light-model counters
									if(t<NR_LIGHT_MODELS) ldsModelListCount[t]=0;


								#if !defined(XBONE) && !defined(PLAYSTATION4)

									GroupMemoryBarrierWithGroupSync();

								#endif


									int nrLightsCombinedList = ldsNrLightsFinal<MAX_NR_COARSE_ENTRIES ? ldsNrLightsFinal : MAX_NR_COARSE_ENTRIES;

									// count the surviving lights per light model
									for(int i=t; i<nrLightsCombinedList; i+=NR_THREADS)

									{

										InterlockedAdd(ldsModelListCount[ g_vLightData[ prunedList[i] ].lightModel ], 1);

									}


									// sort lights

								#if !defined(XBONE) && !defined(PLAYSTATION4)

									sortLightList((int) t, nrLightsCombinedList);

								#endif


									// write lights to global buffers

									// The write-out takes the first ldsModelListCount[0] entries of prunedList as model 0, the next
									// ldsModelListCount[1] entries as model 1, and so on.
									// NOTE(review): this presumably relies on light indices being ordered by light model - confirm.
									int localOffs=0;

									int offs = tileIDX.y*nrTilesX + tileIDX.x;

									for(int m=0; m<NR_LIGHT_MODELS; m++)

									{

										int nrLightsFinal = ldsModelListCount[ m ];

										int nrLightsFinalClamped = nrLightsFinal<MAX_NR_PRUNED_ENTRIES ? nrLightsFinal : MAX_NR_PRUNED_ENTRIES;


										// 16 bit entries packed two per dword: the light count first, then the light indices
										const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;

										// NOTE(review): 'l' has no declaration in this scope; it appears to rely on the variable
										// declared by the coarse list for-loop above still being visible - confirm with the compiler in use.
										for(l=(int) t; l<(int) nrDWords; l += NR_THREADS)

										{

											uint uLow = l==0 ? nrLightsFinalClamped : prunedList[2*l-1+localOffs];

											uint uHigh = prunedList[2*l+0+localOffs];


											// each tile owns 16 dwords per light model
											g_vLightList[16*offs + l] = (uLow&0xffff) | (uHigh<<16);

										}


										localOffs += nrLightsFinal;

										// the list of the next light model starts nrTilesX*nrTilesY tiles further on
										offs += (nrTilesX*nrTilesY);

									}


								}


								// original version

								//float2 vRay2D = float2(max(V.x,V.y), fSclProj);

								//float distSqB = bIsSpotDisc ? distSq : dot(vRay2D,vRay2D);

								//if( all( float3(lightData.radiusSq, fSclProj, fSclProj) > float3(distSq, sqrt(distSqB)*lightData.fPenumbra, 0.0) ) ) uVal = 1;


								// previous new version

								//float fDist2DSqr = bIsSpotDisc ? dot(V,V) : (maC*maC);

								//if( all( float3(lightData.radiusSq, (fSclProj*fSclProj), fSclProj) > float3(distSq, fDist2DSqr*cotaSqr, fSpotNearPlane) ) ) uVal = 1;


								#if 0

								// Disabled (enclosed in '#if 0'): bottom-up merge sort variant of sortLightList.
								// merge() combines the two adjacent sorted runs [l, m] and [m+1, r] of prunedList.
								void merge(int l, int m, int r);


								// Sorts the first n entries of prunedList. Run lengths double on every pass; within a pass
								// each thread merges the pairs of runs starting at its own left_start positions.
								void sortLightList(int localThreadID, int n)

								{

								   for(int curr_size=1; curr_size<=n-1; curr_size = 2*curr_size)

								   {

										for(int left_start=localThreadID*(2*curr_size); left_start<(n-1); left_start+=NR_THREADS*(2*curr_size))

										{

											// last index of the left run, and last index of the right run clamped to the list end
											int mid = left_start + curr_size - 1;

											int right_end = min(left_start + 2*curr_size - 1, n-1);

											merge(left_start, mid, right_end);

										}


									   // all merges of this pass must be finished before the next pass reads them
									   GroupMemoryBarrierWithGroupSync();

								   }

								}


								//groupshared unsigned int tmpBuffer[MAX_NR_COARSE_ENTRIES];


								// Disabled (enclosed in '#if 0'): merges the sorted runs prunedList[l..m] and prunedList[m+1..r]
								// into prunedList[l..r], using coarseList as scratch storage.
								// NOTE(review): 'i' is declared twice (here and in the copy loop) and the tmpBuffer alias is
								// unlikely to compile as written - revisit before re-enabling this path.
								void merge(int l, int m, int r)

								{

								    int i, j, k;


									// start offsets of the left and right runs
									int ol = l;

									int or = m+1;

									int sl = m - l + 1;		// capacity is size of left list = m - l + 1;

								    int sr =  r - m;		// capacity is size of right list = r - m


									unsigned int tmpBuffer[] = coarseList;		// re use coarse list buffer as temp buffer.


									// could do this copy more efficiently before the if-statement

									// in sortLightList() but this requires another GroupMemoryBarrierWithGroupSync()

									for(int i=l; i<=r; i++) tmpBuffer[i] = prunedList[i];


								    i = 0;

								    j = 0;

								    k = l;

								    // take the smaller head element until one run is exhausted (ties pick the left run)
								    while (i < sl && j < sr)

								    {

										const uint lVal = tmpBuffer[ol+i];

										const uint rVal = tmpBuffer[or+j];

										bool pickLeft = lVal <= rVal;

										i = pickLeft ? (i+1) : i;

										j = pickLeft ? j : (j+1);

										prunedList[k] = pickLeft ? lVal : rVal;

								        k++;

								    }


								    // copy whatever is left of the left run
								    while (i < sl)

								    {

								        prunedList[k] = tmpBuffer[ol+i];

								        i++; k++;

								    }


								    // copy whatever is left of the right run
								    while (j < sr)

								    {

								        prunedList[k] = tmpBuffer[or+j];

								        j++; k++;

								    }

								}


								#else


								// NOTE! returns 1 when value_in==0

								// Returns the smallest power of two that is >= value_in, limited to the largest
								// power of two that is <= maxValue.
								// NOTE! returns 1 when value_in==0 (and also when maxValue<2).
								unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
								{
									unsigned int value = 1;

									// 'value <= (maxValue>>1)' is the same test as '(value<<1) <= maxValue' but cannot wrap around.
									// With the shifted form the loop never terminated once value reached 0x80000000,
									// because the doubled value overflowed to 0.
									while(value<value_in && value<=(maxValue>>1))
										value<<=1;

									return value;
								}


								// Sorts the first 'length' entries of prunedList in ascending order with a bitonic sort
								// executed cooperatively by the thread group.
								//   localThreadID - index of the calling thread inside the group.
								//   length        - number of valid entries in prunedList; must be the same for every thread
								//                   because the loops below contain group barriers.
								void sortLightList(int localThreadID, int length)
								{
									// closest pow2 integer greater than or equal to length (limited to the capacity of prunedList)
									const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_COARSE_ENTRIES);			// N is 1 when length is zero but will still not enter first for-loop

									// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
									// The list stores global light indices, which are not limited to MAX_NR_COARSE_ENTRIES, so the
									// padding has to be larger than any index; otherwise padding could sort in front of real lights.
									for(int t=length+localThreadID; t<N; t+=NR_THREADS) { prunedList[t]=0xffffffff; }		// impossible index
									GroupMemoryBarrierWithGroupSync();

									// k is the size of the bitonic sequences being merged, j the compare distance inside a merge step
									for(int k=2; k<=N; k=2*k)
									{
										for(int j=k>>1; j>0; j=j>>1)
										{
											for(int i=localThreadID; i<N; i+=NR_THREADS)
											{
												// every pair (i, i^j) is handled once, by the thread that owns the lower index
												int ixj=i^j;
												if((ixj)>i)
												{
													const unsigned int Avalue = prunedList[i];
													const unsigned int Bvalue = prunedList[ixj];

													// ascending where (i&k)==0, descending otherwise; equal values are never swapped
													const bool mustSwap = (((i&k)!=0)^(Avalue>Bvalue)) && Avalue!=Bvalue;
													if(mustSwap)
													{
														prunedList[i]=Bvalue;
														prunedList[ixj]=Avalue;
													}
												}
											}

											// the next step reads entries written by other threads
											GroupMemoryBarrierWithGroupSync();
										}
									}
								}


								#endif


								#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
								// Rejects coarse lights whose (enlarged) bounding sphere is missed by the ray through the tile.
								//   threadID         - index of the calling thread inside the group.
								//   iNrCoarseLights  - number of valid entries in coarseList on entry.
								//   screenCoordinate - screen position the ray is shot through (the caller passes the tile center).
								// On return coarseList holds the surviving light indices (in no particular order) and the
								// return value is their count. prunedList is overwritten with a copy of the incoming list.
								int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
								{
									lightOffsSph = 0;

									// make a copy of coarseList in prunedList.
									for(int iCopy=threadID; iCopy<iNrCoarseLights; iCopy+=NR_THREADS)
										prunedList[iCopy]=coarseList[iCopy];

								#if !defined(XBONE) && !defined(PLAYSTATION4)
									GroupMemoryBarrierWithGroupSync();
								#endif

									// ray direction through screenCoordinate, at unit distance along the view axis
								#ifdef LEFT_HAND_COORDINATES
									float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
								#else
									float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
								#endif

									float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
									float worldDistAtDepthOne = 8*onePixDiagDist;		// scale by half a tile

									for(int iLight=threadID; iLight<iNrCoarseLights; iLight+=NR_THREADS)
									{
										// Read the light index from the backup in prunedList, not from coarseList: other threads
										// are already compacting their survivors into coarseList below and nothing orders those
										// writes against this read.
										const unsigned int uLightIndex = prunedList[iLight];
										SFiniteLightBound lightData = g_data[uLightIndex];

										const float3 center = lightData.center.xyz;
										float fRad = lightData.radius;

								#if 1
										float3 maxZdir = float3(-center.z*center.x, -center.z*center.y, center.x*center.x + center.y*center.y);		// cross(center,cross(Zaxis,center))
										float len = length(maxZdir);
										float scalarProj = len>0.0001 ? (maxZdir.z/len) : len;	// since len>=(maxZdir.z/len) we can use len as an approximate value when len<=epsilon
										float fOffs = scalarProj*fRad;
								#else
										float fOffs = fRad;		// more false positives due to larger radius but works too
								#endif

										// enlarge the sphere by the half tile footprint at the offset depth, so that a single
										// ray can stand in for the whole tile
								#ifdef LEFT_HAND_COORDINATES
										fRad = fRad + (center.z+fOffs)*worldDistAtDepthOne;
								#else
										fRad = fRad + (center.z-fOffs)*worldDistAtDepthOne;
								#endif

										// ray/sphere test for the ray t*V: a*t^2 - 2*CdotV*t + c = 0
										float a = dot(V,V);
										float CdotV = dot(center,V);
										float c = dot(center,center) - fRad*fRad;

										float fDescDivFour = CdotV*CdotV - a*c;
										// c<0: the ray origin is inside the sphere; otherwise require a real root in front of the origin
										if(c<0 || (fDescDivFour>0 && CdotV>0))		// if ray hit bounding sphere
										{
											unsigned int uIndex;
											InterlockedAdd(lightOffsSph, 1, uIndex);
											coarseList[uIndex]=uLightIndex;
										}
									}

								#if !defined(XBONE) && !defined(PLAYSTATION4)
									GroupMemoryBarrierWithGroupSync();
								#endif

									return lightOffsSph;
								}
								#endif