// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
// https://github.com/wolfgangfengel/GPU-Pro-7
//
// NOTE(review): this copy of the file looks damaged. The original line breaks are gone and, in
// several places, the text between a '<' and a later '>' appears to be missing (the buffer
// declarations below carry no element type, and some conditions run straight into unrelated
// statements further down). Every such spot is marked with NOTE(review) and left exactly as
// found; restore the missing text from the upstream file before relying on this shader.

// Both kernels map LIGHTLISTGEN to their own name; the second one also lists
// USE_TWO_PASS_TILED_LIGHTING.
#pragma kernel TileLightListGen LIGHTLISTGEN=TileLightListGen
#pragma kernel TileLightListGen_SrcBigTile LIGHTLISTGEN=TileLightListGen_SrcBigTile USE_TWO_PASS_TILED_LIGHTING

#include "..\common\ShaderBase.h"
#include "LightDefinitions.cs.hlsl"

#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

uniform int g_iNrVisibLights;
uniform uint2 g_viDimensions;
uniform float4x4 g_mInvScrProjection;
uniform float4x4 g_mScrProjection;

// NOTE(review): the StructuredBuffer declarations below have no element type; the template
// argument seems to have been lost. TODO restore.
Texture2D g_depth_tex : register( t0 );
StructuredBuffer g_vBoundsBuffer : register( t1 );
StructuredBuffer g_vLightData : register( t2 );
StructuredBuffer g_data : register( t3 );

#ifdef USE_TWO_PASS_TILED_LIGHTING
StructuredBuffer g_vBigTileLightList : register( t4 );
#endif

// Thread-group size used by [numthreads] and as the stride of the per-thread loops below.
#define NR_THREADS 64

// output buffer
//RWBuffer g_vLightList : register( u0 );
RWStructuredBuffer g_vLightList : register( u0 );

#define MAX_NR_COARSE_ENTRIES 64
#define MAX_NR_PRUNED_ENTRIES 24

groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];
groupshared unsigned int prunedList[MAX_NR_COARSE_ENTRIES]; // temporarily support room for all 64 while in LDS

groupshared uint ldsZMin;
groupshared uint ldsZMax;
groupshared uint lightOffs;
#ifdef FINE_PRUNING_ENABLED
// Two 32-bit words of per-light flags; see the uLightsFlags writes in LIGHTLISTGEN.
groupshared uint ldsDoesLightIntersect[2];
#endif
groupshared int ldsNrLightsFinal;

groupshared int ldsModelListCount[NR_LIGHT_MODELS]; // since NR_LIGHT_MODELS is 2

#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
// Append counter used by SphericalIntersectionTests().
groupshared uint lightOffsSph;
#endif

//float GetLinearDepth(float3 vP)
//{
// float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
// return v4Pres.z / v4Pres.w;
//}

// Transforms the point (0, 0, zDptBufSpace, 1) by g_mInvScrProjection and returns z/w of the result.
float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far
{
    float3 vP = float3(0.0f,0.0f,zDptBufSpace);
    float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
    return v4Pres.z / v4Pres.w;
}

// Builds a position from a screen coordinate and a linear depth.
// The x/y components are derived from v2ScrPos using the [0].x, [0].z, [1].y and [1].z entries of
// g_mScrProjection, z is 1, and the whole vector is scaled by fLinDepth.
// The sign convention differs depending on LEFT_HAND_COORDINATES.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
    float fSx = g_mScrProjection[0].x;
    float fCx = g_mScrProjection[0].z;
    float fSy = g_mScrProjection[1].y;
    float fCy = g_mScrProjection[1].z;

#ifdef LEFT_HAND_COORDINATES
    return fLinDepth*float3( ((v2ScrPos.x-fCx)/fSx), ((v2ScrPos.y-fCy)/fSy), 1.0 );
#else
    return fLinDepth*float3( -((v2ScrPos.x+fCx)/fSx), -((v2ScrPos.y+fCy)/fSy), 1.0 );
#endif
}

// Returns the length of (1/fSx, 1/fSy), where fSx and fSy are g_mScrProjection[0].x and [1].y.
float GetOnePixDiagWorldDistAtDepthOne()
{
    float fSx = g_mScrProjection[0].x;
    float fSy = g_mScrProjection[1].y;

    return length( float2(1.0/fSx,1.0/fSy) );
}

// Forward declarations; the definitions are at the end of the file.
void sortLightList(int localThreadID, int n);

#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
#endif

// Kernel entry point, NR_THREADS threads per group. tileIDX is taken from the group id.
// As far as the surviving text shows, the kernel:
//   - reads depths through FetchDepth() and linearises them with GetLinearDepth(),
//   - walks the lights (all g_iNrVisibLights, or the entries of g_vBigTileLightList when
//     USE_TWO_PASS_TILED_LIGHTING is defined) and compares their bounds from g_vBoundsBuffer
//     against the tile,
//   - runs per-pixel tests on the candidates and records one flag bit per light,
//   - packs the result as two 16-bit values per dword into g_vLightList.
// NOTE(review): large parts of this function are missing, so braces do not balance and the
// indentation below is only cosmetic.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    // NOTE(review): truncated - the condition of this if and everything up to the clamped
    // pixel-coordinate computation appears to be missing.
    if(t>4)), uint2(iWidth-1, iHeight-1) );
    const float fDepth = FetchDepth(g_depth_tex, uCrd);
    vLinDepths[i] = GetLinearDepth(fDepth);
    // NOTE(review): truncated - the rest of this condition and the following statements are missing.
    if(fDepth>2;
    const int bigTileIdx = (tileIDX.y>>2)*NrBigTilesX + (tileIDX.x>>2); // map the idx to 64x64 tiles
    // Entry 0 of a big tile's slice holds its light count; the light indices follow from entry 1.
    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+0];
    for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
    {
        int l = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+l0+1];
#else
    for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
    {
#endif
        // Two bounds vectors per light: one at index l, one at index l+g_iNrVisibLights.
        const float3 vMi = g_vBoundsBuffer[l];
        const float3 vMa = g_vBoundsBuffer[l+g_iNrVisibLights];

        // NOTE(review): truncated - the second half of this overlap test, the rest of the coarse
        // pass and the start of the per-pixel pass appear to be missing.
        if( all(vMa>vTileLL) && all(vMi>4)), uint2(iWidth-1, iHeight-1));
        float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

        // check pixel
        float3 fromLight = vVPos-lightData.lightPos.xyz;
        float distSq = dot(fromLight,fromLight);
        const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz

        float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );

        float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
        // Passes when the pixel is inside the radius and fSclProj exceeds fDist2D*cotan.
        if( all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) ) ) uVal = 1;
    }

    // Bit (l&31) of word 0 (l<32) or word 1 records the result for light l.
    uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
    ++l;
    // NOTE(review): truncated - the rest of this assignment and the head of the next test loop are missing.
    idxCoarse = l>4)), uint2(iWidth-1, iHeight-1));
    float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

    // check pixel
    float3 vLp = lightData.lightPos.xyz;
    float3 toLight = vLp - vVPos;
    float distSq = dot(toLight,toLight);
    // Passes when the pixel lies within the light radius.
    if(lightData.radiusSq>distSq) uVal = 1;
    }

    uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
    ++l;
    // NOTE(review): truncated - the rest of this assignment and the head of the next test loop are missing.
    idxCoarse = l>4)), uint2(iWidth-1, iHeight-1));
    float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);

    // check pixel
    float3 toLight = lightData.lightPos.xyz - vVPos;

    // Project onto the light's three axes, then normalise against the inner distance and range.
    float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
    dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
    if( max(max(dist.x, dist.y), dist.z)<1 ) uVal = 1; // but allows us to not write out OuterDists
    }

    uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
    ++l;
    // NOTE(review): truncated - text is missing between "l" and "=MAX_TYPES".
    idxCoarse = l=MAX_TYPES) ++l;
    }

    // Merge this thread's flags into the group-shared words.
    InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
    InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
    if(t==0) ldsNrLightsFinal = 0;

#if !defined(XBONE) && !defined(PLAYSTATION4)
    GroupMemoryBarrierWithGroupSync();
#endif

    // Thread t handles coarse entry t: if its flag bit is set, it reserves a slot through the
    // atomic counter ldsNrLightsFinal.
    if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 )
    {
        unsigned int uInc = 1;
        unsigned int uIndex;
        InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
        // NOTE(review): truncated - the rest of this condition and the statements up to the
        // packing loop appear to be missing.
        if(uIndex>1;
        for(int l=(int) t; l<(int) nrDWords; l += NR_THREADS)
        {
            // Two 16-bit values per dword; the low half of dword 0 carries nrLightsFinalClamped.
            uint uLow = l==0 ? nrLightsFinalClamped : prunedList[2*l-1+localOffs];
            uint uHigh = prunedList[2*l+0+localOffs];

            g_vLightList[16*offs + l] = (uLow&0xffff) | (uHigh<<16);
        }

        localOffs += nrLightsFinal;
        offs += (nrTilesX*nrTilesY);
    }
}

// original version
//float2 vRay2D = float2(max(V.x,V.y), fSclProj);
//float distSqB = bIsSpotDisc ? distSq : dot(vRay2D,vRay2D);
//if( all( float3(lightData.radiusSq, fSclProj, fSclProj) > float3(distSq, sqrt(distSqB)*lightData.fPenumbra, 0.0) ) ) uVal = 1;

// previous new version
//float fDist2DSqr = bIsSpotDisc ? dot(V,V) : (maC*maC);
//if( all( float3(lightData.radiusSq, (fSclProj*fSclProj), fSclProj) > float3(distSq, fDist2DSqr*cotaSqr, fSpotNearPlane) ) ) uVal = 1;

// The #if 0 branch is compiled out: an alternative sortLightList() that repeatedly merges
// adjacent runs of prunedList with merge(), doubling the run length each pass.
#if 0
void merge(int l, int m, int r);

void sortLightList(int localThreadID, int n)
{
    for(int curr_size=1; curr_size<=n-1; curr_size = 2*curr_size)
    {
        // Each thread takes every NR_THREADS-th pair of runs.
        for(int left_start=localThreadID*(2*curr_size); left_start<(n-1); left_start+=NR_THREADS*(2*curr_size))
        {
            int mid = left_start + curr_size - 1;
            int right_end = min(left_start + 2*curr_size - 1, n-1);
            merge(left_start, mid, right_end);
        }

        GroupMemoryBarrierWithGroupSync();
    }
}

//groupshared unsigned int tmpBuffer[MAX_NR_COARSE_ENTRIES];

// Merges prunedList[l..m] and prunedList[m+1..r] back into prunedList[l..r].
void merge(int l, int m, int r)
{
    int i, j, k;

    int ol = l;
    int or = m+1;
    int sl = m - l + 1; // capacity is size of left list = m - l + 1;
    int sr = r - m; // capacity is size of right list = r - m

    unsigned int tmpBuffer[] = coarseList; // re use coarse list buffer as temp buffer.

    // could do this copy more efficiently before the if-statement
    // in sortLightList() but this requires another GroupMemoryBarrierWithGroupSync()
    // NOTE(review): this loop declares a second "i" although i is already declared above.
    for(int i=l; i<=r; i++) tmpBuffer[i] = prunedList[i];

    i = 0; j = 0; k = l;
    while (i < sl && j < sr)
    {
        const uint lVal = tmpBuffer[ol+i];
        const uint rVal = tmpBuffer[or+j];
        bool pickLeft = lVal <= rVal;
        i = pickLeft ? (i+1) : i;
        j = pickLeft ? j : (j+1);
        prunedList[k] = pickLeft ? lVal : rVal;
        k++;
    }

    // Copy whatever remains of either run.
    while (i < sl)
    {
        prunedList[k] = tmpBuffer[ol+i];
        i++; k++;
    }

    while (j < sr)
    {
        prunedList[k] = tmpBuffer[or+j];
        j++; k++;
    }
}

#else

// NOTE! returns 1 when value_in==0
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
    unsigned int value = 1;

    // NOTE(review): truncated - the rest of LimitPow2AndClamp and the head of the active
    // sortLightList() (up to its inner loops) appear to be missing; what follows is the
    // tail of a loop header over j.
    while(value>1; j>0; j=j>>1)
    {
        // NOTE(review): truncated - loop condition, increment and the partner-index
        // computation (ixj) are missing.
        for(int i=localThreadID; ii)
        {
            const unsigned int Avalue = prunedList[i];
            const unsigned int Bvalue = prunedList[ixj];

            // Swap when exactly one of "(i&k) is non-zero" and "Avalue>Bvalue" holds,
            // and the two values differ.
            const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
            if(mustSwap)
            {
                prunedList[i]=Bvalue;
                prunedList[ixj]=Avalue;
            }
        }
    }

    GroupMemoryBarrierWithGroupSync();
}
}
}
#endif

#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
// Filters the coarse light list: per the surviving code, each kept light is appended to
// coarseList through the atomic counter lightOffsSph, reading its index from the copy held
// in prunedList. Returns lightOffsSph.
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
    lightOffsSph = 0;

    // make a copy of coarseList in prunedList.
    // NOTE(review): truncated - the copy loop, the setup of the ray/sphere variables
    // (V, center, fRad, maxZdir, len, worldDistAtDepthOne) and an opening #if are missing.
    for(int l=threadID; l0.0001 ? (maxZdir.z/len) : len; // since len>=(maxZdir.z/len) we can use len as an approximate value when len<=epsilon
    float fOffs = scalarProj*fRad;
#else
    float fOffs = fRad; // more false positives due to larger radius but works too
#endif

    // Grow the radius by a depth-dependent amount.
#ifdef LEFT_HAND_COORDINATES
    fRad = fRad + (center.z+fOffs)*worldDistAtDepthOne;
#else
    fRad = fRad - (center.z-fOffs)*worldDistAtDepthOne;
#endif

    // Quadratic terms for intersecting the direction V with the sphere (center, fRad).
    float a = dot(V,V);
    float CdotV = dot(center,V);
    float c = dot(center,center) - fRad*fRad;

    float fDescDivFour = CdotV*CdotV - a*c;
    if(c<0 || (fDescDivFour>0 && CdotV>0)) // if ray hit bounding sphere
    {
        unsigned int uIndex;
        InterlockedAdd(lightOffsSph, 1, uIndex);
        coarseList[uIndex]=prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
    }
    }

#if !defined(XBONE) && !defined(PLAYSTATION4)
    GroupMemoryBarrierWithGroupSync();
#endif

    return lightOffsSph;
}
#endif