// -----------------------------------------------------------------------------
// Tile light list generation (compute kernel: TileLightListGen).
//
// NOTE(review): this copy of the file looks damaged -- line breaks are gone and
// text between a '<' and a following '>' appears to be missing in several places
// (StructuredBuffer element types, loop headers, some conditions). Confirm
// against version control before building or editing the code below.
//
// Resources declared here:
//   g_depth_tex      (t0)  depth texture read through FetchDepth().
//   g_vBoundsBuffer  (t1)  structured buffer of light bounds.
//   g_vLightData     (t2)  structured buffer of per-light data (read as lgtDat).
//   g_vLightList     (u0)  output structured buffer.
//
// Thread-group layout: [numthreads(NR_THREADS,1,1)] with NR_THREADS = 64; the
// xy of SV_GroupID is used as the tile index (tileIDX).
//
// Group-shared state:
//   coarseList / prunedList   MAX_NR_COARSE_ENTRIES (64) entries each.
//   ldsZMin / ldsZMax         depth bounds shared by the group.
//   ldsDoesLightIntersect[2]  2x32 flag bits, one per coarse light; only
//                             declared when FINE_PRUNING_ENABLED is defined
//                             (it is defined at the top of this file).
//   ldsNrLightsFinal          counter advanced with InterlockedAdd.
//   ldsModelListCount[2]      one counter per light model (NR_LIGHT_MODELS is 2).
//
// Helpers:
//   GetLinearDepth(z)   multiplies (0,0,z,1) by g_mInvScrProjection and returns
//                       z/w; input is a depth-buffer value (0 near, 1 far).
//   GetViewPosFromLinDepth(scrPos, linDepth)
//                       rebuilds a view-space position from a screen position
//                       using the scale/offset terms in rows 0 and 1 of
//                       g_mScrProjection; the sign convention is chosen by
//                       LEFT_HAND_COORDINATES.
//
// Fine pruning (visible in the kernel body): each thread accumulates flags in
// uLightsFlags[2] from per-pixel tests against lgtDat -- a radius-squared test
// combined with a cone test using lgtDat.cotan, a radius-squared-only test, and
// a box test using vBoxInnerDist / vBoxInvRange. The flags are merged with
// InterlockedOr into ldsDoesLightIntersect. The conditions that select which
// test applies to which light are not visible in this copy.
//
// Output packing: g_vLightList[16*offs + l] = (uLow & 0xffff) | (uHigh << 16),
// i.e. two 16-bit values per 32-bit word; the low half of word 0 holds
// nrLightsFinalClamped. offs advances by nrTilesX*nrTilesY between passes.
//
// Sorting: the "#if 0" branch holds a disabled merge-sort variant. The "#else"
// branch holds LimitPow2AndClamp (returns 1 when value_in == 0, per its own
// note) and compare-and-swap passes over prunedList separated by
// GroupMemoryBarrierWithGroupSync() -- presumably a bitonic sort; the function
// header and loop bounds are not visible here, TODO confirm.
// -----------------------------------------------------------------------------
#pragma kernel TileLightListGen #include "..\common\ShaderBase.h" #include "LightDefinitions.cs" #define FINE_PRUNING_ENABLED uniform int g_iNrVisibLights; uniform float4x4 g_mInvScrProjection; uniform float4x4 g_mScrProjection; Texture2D g_depth_tex : register( t0 ); StructuredBuffer g_vBoundsBuffer : register( t1 ); StructuredBuffer g_vLightData : register( t2 ); #define NR_THREADS 64 // output buffer //RWBuffer g_vLightList : register( u0 ); RWStructuredBuffer g_vLightList : register( u0 ); #define MAX_NR_COARSE_ENTRIES 64 #define MAX_NR_PRUNED_ENTRIES 24 groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES]; groupshared unsigned int prunedList[MAX_NR_COARSE_ENTRIES]; // temporarily support room for all 64 while in LDS groupshared uint ldsZMin; groupshared uint ldsZMax; groupshared uint lightOffs; #ifdef FINE_PRUNING_ENABLED groupshared uint ldsDoesLightIntersect[2]; #endif groupshared int ldsNrLightsFinal; groupshared int ldsModelListCount[2]; // since NR_LIGHT_MODELS is 2 //float GetLinearDepth(float3 vP) //{ // float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0)); // return v4Pres.z / v4Pres.w; //} float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far { float3 vP = float3(0.0f,0.0f,zDptBufSpace); float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0)); return v4Pres.z / v4Pres.w; } float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth) { float fSx = g_mScrProjection[0].x; float fCx = g_mScrProjection[0].z; float fSy = g_mScrProjection[1].y; float fCy = g_mScrProjection[1].z; #ifdef LEFT_HAND_COORDINATES return fLinDepth*float3( ((v2ScrPos.x-fCx)/fSx), ((v2ScrPos.y-fCy)/fSy), 1.0 ); #else return fLinDepth*float3( -((v2ScrPos.x+fCx)/fSx), -((v2ScrPos.y+fCy)/fSy), 1.0 ); #endif } void sortLightList(int localThreadID, int n); [numthreads(NR_THREADS, 1, 1)] void TileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint2 tileIDX = u3GroupID.xy; uint t=threadID; if(t>4)), uint2(iWidth-1, iHeight-1) ); const 
float fDpth = FetchDepth(g_depth_tex, uCrd); if(fDpthvTileLL) && all(vMi>4)), uint2(iWidth-1, iHeight-1) ); float3 v3ScrPos = float3(uCrd.x+0.5, uCrd.y+0.5, FetchDepth(g_depth_tex, uCrd)); vLinDepths[i] = GetLinearDepth(v3ScrPos.z); } uint uLightsFlags[2] = {0,0}; int l=0; // we need this outer loop for when we cannot assume a wavefront is 64 wide // since in this case we cannot assume the lights will remain sorted by type #if !defined(XBONE) && !defined(PLAYSTATION4) while(l>4)), uint2(iWidth-1, iHeight-1)); float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]); // check pixel float3 fromLight = vVPos-lgtDat.vLpos.xyz; float distSq = dot(fromLight,fromLight); const float fSclProj = dot(fromLight, lgtDat.vLaxisZ.xyz); // spotDir = lgtDat.vLaxisZ.xyz float2 V = abs( float2( dot(fromLight, lgtDat.vLaxisX.xyz), dot(fromLight, lgtDat.vLaxisY.xyz) ) ); float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y); if( all( float2(lgtDat.fSphRadiusSq, fSclProj) > float2(distSq, fDist2D*lgtDat.cotan) ) ) uVal = 1; } uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31)); ++l; idxCoarse = l>4)), uint2(iWidth-1, iHeight-1)); float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]); // check pixel float3 vLp = lgtDat.vLpos.xyz; float3 toLight = vLp - vVPos; float distSq = dot(toLight,toLight); if(lgtDat.fSphRadiusSq>distSq) uVal = 1; } uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31)); ++l; idxCoarse = l>4)), uint2(iWidth-1, iHeight-1)); float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]); // check pixel float3 toLight = lgtDat.vLpos.xyz - vVPos; float3 dist = float3( dot(toLight, lgtDat.vLaxisX), dot(toLight, lgtDat.vLaxisY), dot(toLight, lgtDat.vLaxisZ) ); dist = (abs(dist) - lgtDat.vBoxInnerDist) * lgtDat.vBoxInvRange; // not as efficient as it could be if( max(max(dist.x, dist.y), dist.z)<1 ) uVal = 1; // but allows us to not write out OuterDists } uLightsFlags[l<32 ? 
0 : 1] |= (uVal<<(l&31)); ++l; idxCoarse = l=MAX_TYPES) ++l; #endif } InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]); InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]); if(t==0) ldsNrLightsFinal = 0; #if !defined(XBONE) && !defined(PLAYSTATION4) GroupMemoryBarrierWithGroupSync(); #endif if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 ) { unsigned int uInc = 1; unsigned int uIndex; InterlockedAdd(ldsNrLightsFinal, uInc, uIndex); if(uIndex>1; for(l=(int) t; l<(int) nrDWords; l += NR_THREADS) { uint uLow = l==0 ? nrLightsFinalClamped : prunedList[2*l-1+localOffs]; uint uHigh = prunedList[2*l+0+localOffs]; g_vLightList[16*offs + l] = (uLow&0xffff) | (uHigh<<16); } localOffs += nrLightsFinal; offs += (nrTilesX*nrTilesY); } } // original version //float2 vRay2D = float2(max(V.x,V.y), fSclProj); //float distSqB = bIsSpotDisc ? distSq : dot(vRay2D,vRay2D); //if( all( float3(lgtDat.fSphRadiusSq, fSclProj, fSclProj) > float3(distSq, sqrt(distSqB)*lgtDat.fPenumbra, 0.0) ) ) uVal = 1; // previous new version //float fDist2DSqr = bIsSpotDisc ? 
dot(V,V) : (maC*maC); //if( all( float3(lgtDat.fSphRadiusSq, (fSclProj*fSclProj), fSclProj) > float3(distSq, fDist2DSqr*cotaSqr, fSpotNearPlane) ) ) uVal = 1; #if 0 void merge(int l, int m, int r); void sortLightList(int localThreadID, int n) { for(int curr_size=1; curr_size<=n-1; curr_size = 2*curr_size) { for(int left_start=localThreadID*(2*curr_size); left_start<(n-1); left_start+=NR_THREADS*(2*curr_size)) { int mid = left_start + curr_size - 1; int right_end = min(left_start + 2*curr_size - 1, n-1); merge(left_start, mid, right_end); } GroupMemoryBarrierWithGroupSync(); } } //groupshared unsigned int tmpBuffer[MAX_NR_COARSE_ENTRIES]; void merge(int l, int m, int r) { int i, j, k; int ol = l; int or = m+1; int sl = m - l + 1; // capacity is size of left list = m - l + 1; int sr = r - m; // capacity is size of right list = r - m unsigned int tmpBuffer[] = coarseList; // re use coarse list buffer as temp buffer. // could do this copy more efficiently before the if-statement // in sortLightList() but this requires another GroupMemoryBarrierWithGroupSync() for(int i=l; i<=r; i++) tmpBuffer[i] = prunedList[i]; i = 0; j = 0; k = l; while (i < sl && j < sr) { const uint lVal = tmpBuffer[ol+i]; const uint rVal = tmpBuffer[or+j]; bool pickLeft = lVal <= rVal; i = pickLeft ? (i+1) : i; j = pickLeft ? j : (j+1); prunedList[k] = pickLeft ? lVal : rVal; k++; } while (i < sl) { prunedList[k] = tmpBuffer[ol+i]; i++; k++; } while (j < sr) { prunedList[k] = tmpBuffer[or+j]; j++; k++; } } #else // NOTE! returns 1 when value_in==0 unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue) { unsigned int value = 1; while(value>1; j>0; j=j>>1) { for(int i=localThreadID; ii) { const unsigned int Avalue = prunedList[i]; const unsigned int Bvalue = prunedList[ixj]; const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue; if(mustSwap) { prunedList[i]=Bvalue; prunedList[ixj]=Avalue; } } } GroupMemoryBarrierWithGroupSync(); } } } #endif