/*
 * Per-tile, per-cluster light list generation (compute shader).
 *
 * NOTE(review): this copy of the file is damaged and will not compile as-is:
 *   - the original line breaks are gone, so every "//" comment and every
 *     preprocessor directive now runs to the end of a very long physical line;
 *   - several spans appear to have lost text (presumably everything between a
 *     '<' and a later '>' was stripped -- TODO confirm against a clean copy).
 *     Examples: resource declarations without an element type
 *     ("StructuredBuffer g_vBoundsBuffer", "RWBuffer g_vLayeredLightList"),
 *     "(1< g_depth_tex", "for(int i=0; ivTileLL.xy)", "(1<>2;",
 *     "for(int l=threadID; l0.0001 ?".
 *   Restore from a clean copy before changing any logic here.
 *
 * Kernels declared by the #pragma kernel directives:
 *   TileLightListGen_NoDepthRT      LIGHTLISTGEN entry point, no extra defines
 *   TileLightListGen_DepthRT        + ENABLE_DEPTH_TEXTURE_BACKPLANE
 *   TileLightListGen_DepthRT_MSAA   + ENABLE_DEPTH_TEXTURE_BACKPLANE, MSAA_ENABLED
 *   ClearAtomic                     writes 0 to g_LayeredSingleIdxBuffer[0]
 *
 * Compile-time switches set in this file: PERFORM_SPHERICAL_INTERSECTION_TESTS
 * and CONV_HULL_TEST_ENABLED are defined; the EXACT_EDGE_TESTS define is
 * commented out, so the code guarded by it is not meant to be compiled.
 *
 * Group-shared state: coarseList[MAX_NR_COARSE_ENTRIES = 64] (light indices for
 * the tile), clusterIdxs[32] (two lights per entry, 16 bits each: low byte =
 * min cluster index, high byte = max cluster index), lightPlanes[4*6] (six
 * plane equations for each of four lights), lightOffs, and optionally ldsZMax,
 * ldsIsLightInvisible, lightOffs2, lightOffsSph.
 *
 * Functions, in file order:
 *   GetLinearDepth(z)
 *       Multiplies (0, 0, z, 1) by g_mInvScrProjection and returns z / w.
 *   GetViewPosFromLinDepth(v2ScrPos, fLinDepth)
 *       Reads x/y scale from g_mScrProjection[0].x / [1].y and x/y offset from
 *       [0].z / [1].z, and returns fLinDepth * float3(x', y', 1). The signs
 *       used for x' and y' depend on LEFT_HAND_COORDINATES.
 *   GetOnePixDiagWorldDistAtDepthOne()
 *       Returns length(float2(1/fSx, 1/fSy)) using the same scale terms.
 *   CheckIntersection(l, k, viTilLL, viTilUR, fModulUserScale)
 *       True when cluster k lies inside the [min, max] cluster range stored for
 *       coarse light l. With CONV_HULL_TEST_ENABLED the hit is rejected if, for
 *       any of the six planes in lightPlanes[6*(l&3) + p], all eight corners of
 *       the cluster cell give dot(plane, float4(corner, 1)) > 0.
 *       ClusterIdxToZ is not defined in the visible text (presumably it comes
 *       from an include -- TODO confirm).
 *   CheckIntersectionBasic(l, k)
 *       The cluster range test only.
 *   LIGHTLISTGEN(threadID, u3GroupID)      [numthreads(NR_THREADS = 64, 1, 1)]
 *       One thread group per 16x16 pixel tile (viTilLL = 16 * tileIDX). Visible
 *       steps: thread 0 resets lightOffs (and ldsZMax when the depth backplane
 *       is enabled); with ENABLE_DEPTH_TEXTURE_BACKPLANE a loop walks the 256
 *       pixel slots of the tile, clamped to the texture size; optional exact
 *       edge culling; sortLightList; per pair of coarse lights the min/max
 *       cluster indices are clamped to 255 and packed into clusterIdxs;
 *       lightPlanes is filled through FetchPlane for up to four lights at a
 *       time. NOTE(review): the coarse-list build, the far-plane/scale
 *       computation and the per-cluster output loop are among the damaged
 *       spans and cannot be described from this copy.
 *   sortLightList(localThreadID, n)
 *       NOTE(review): the head of this function is missing. The visible tail
 *       compares and swaps coarseList[i] / coarseList[ixj] and issues a group
 *       barrier per pass; it looks like a bitonic sort -- confirm.
 *   GetPlaneEq(vBoxX, vBoxY, vBoxZ, vCen, vScaleXY, sideIndex)
 *       Returns float4(vN, -dot(vN, p0)) for one of six sides. sideIndex 0/1
 *       selects the X axis, 2/3 the Y axis, anything else the Z axis, and
 *       (sideIndex & 1) selects the sign. vN is negated under
 *       LEFT_HAND_COORDINATES. The forward declaration names the fifth
 *       parameter vScaleXZ while the definition names it vScaleXY.
 *   FetchPlane(l, p)
 *       Loads g_data[coarseList[l]], negates its Z axis and calls GetPlaneEq.
 *       The local fRadius is read but never used.
 *   SphericalIntersectionTests(threadID, iNrCoarseLights, screenCoordinate)
 *       Entries that fail the test are overwritten with 0xffffffff, then
 *       thread 0 alone removes the gaps. NOTE(review): the loop header and the
 *       start of its body are missing.
 *   CullByExactEdgeTests(...) and its helpers (only under EXACT_EDGE_TESTS)
 *       NOTE(review): heavily damaged; the visible tail keeps a light when
 *       ldsIsLightInvisible == 0 and returns lightOffs2.
 *   ClearAtomic(threadID, u3GroupID)       [numthreads(1, 1, 1)]
 *       Sets g_LayeredSingleIdxBuffer[0] to 0.
 *
 * Group barriers (GroupMemoryBarrierWithGroupSync) are wrapped in
 * "#if !defined(XBONE) && !defined(PLAYSTATION4)" in most places, so they are
 * omitted when either of those symbols is defined.
 */
#pragma kernel TileLightListGen_NoDepthRT LIGHTLISTGEN=TileLightListGen_NoDepthRT #pragma kernel TileLightListGen_DepthRT LIGHTLISTGEN=TileLightListGen_DepthRT ENABLE_DEPTH_TEXTURE_BACKPLANE #pragma kernel TileLightListGen_DepthRT_MSAA LIGHTLISTGEN=TileLightListGen_DepthRT_MSAA ENABLE_DEPTH_TEXTURE_BACKPLANE MSAA_ENABLED #pragma kernel ClearAtomic #include "..\common\ShaderBase.h" #include "LightDefinitions.cs.hlsl" //#define EXACT_EDGE_TESTS #define PERFORM_SPHERICAL_INTERSECTION_TESTS #define CONV_HULL_TEST_ENABLED uniform int g_iNrVisibLights; uniform float4x4 g_mInvScrProjection; uniform float4x4 g_mScrProjection; uniform float g_fClustScale; uniform float g_fClustBase; uniform float g_fNearPlane; uniform float g_fFarPlane; uniform int g_iLog2NumClusters; // numClusters = (1< g_depth_tex : register( t0 ); #else Texture2D g_depth_tex : register( t0 ); #endif StructuredBuffer g_vBoundsBuffer : register( t1 ); StructuredBuffer g_vLightData : register( t2 ); StructuredBuffer g_data : register( t3 ); #define NR_THREADS 64 // output buffer RWBuffer g_vLayeredLightList : register( u0 ); RWBuffer g_LayeredOffset : register( u1 ); RWBuffer g_LayeredSingleIdxBuffer : register( u2 ); #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE RWBuffer g_fModulUserscale : register( u3 ); #endif #define MAX_NR_COARSE_ENTRIES 64 groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES]; groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES/2]; groupshared float4 lightPlanes[4*6]; groupshared uint lightOffs; #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE groupshared int ldsZMax; #endif #ifdef EXACT_EDGE_TESTS groupshared uint ldsIsLightInvisible; groupshared uint lightOffs2; #endif #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS groupshared uint lightOffsSph; #endif float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far { float3 vP = float3(0.0f,0.0f,zDptBufSpace); float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0)); return v4Pres.z / v4Pres.w; } float3 GetViewPosFromLinDepth(float2 
/* Next physical line: rest of GetViewPosFromLinDepth, GetOnePixDiagWorldDistAtDepthOne, forward declarations, start of CheckIntersection. */
v2ScrPos, float fLinDepth) { float fSx = g_mScrProjection[0].x; float fCx = g_mScrProjection[0].z; float fSy = g_mScrProjection[1].y; float fCy = g_mScrProjection[1].z; #ifdef LEFT_HAND_COORDINATES return fLinDepth*float3( ((v2ScrPos.x-fCx)/fSx), ((v2ScrPos.y-fCy)/fSy), 1.0 ); #else return fLinDepth*float3( -((v2ScrPos.x+fCx)/fSx), -((v2ScrPos.y+fCy)/fSy), 1.0 ); #endif } float GetOnePixDiagWorldDistAtDepthOne() { float fSx = g_mScrProjection[0].x; float fSy = g_mScrProjection[1].y; return length( float2(1.0/fSx,1.0/fSy) ); } void sortLightList(int localThreadID, int n); #ifdef EXACT_EDGE_TESTS int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane); #endif #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate); #endif // returns 1 for intersection and 0 for none float4 GetPlaneEq(const float3 vBoxX, const float3 vBoxY, const float3 vBoxZ, const float3 vCen, const float2 vScaleXZ, const int sideIndex); float4 FetchPlane(int l, int p); bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float fModulUserScale) { unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff; bool bIsHit = ((val>>0)&0xff)<=((uint) k) && ((uint) k)<=((val>>8)&0xff); if(bIsHit) { #ifdef CONV_HULL_TEST_ENABLED float depthAtNearZ = ClusterIdxToZ(k, fModulUserScale); float depthAtFarZ = ClusterIdxToZ(k+1, fModulUserScale); for(int p=0; p<6; p++) { float4 plane = lightPlanes[6*(l&3)+p]; bool bAllInvisib = true; for(int i=0; i<8; i++) { float x = (i&1)==0 ? viTilLL.x : viTilUR.x; float y = (i&2)==0 ? viTilLL.y : viTilUR.y; float z = (i&4)==0 ? 
/* Next physical line: rest of CheckIntersection, CheckIntersectionBasic, first part of the LIGHTLISTGEN kernel (NOTE(review): text is missing inside the depth loop and the coarse-list build). */
depthAtNearZ : depthAtFarZ; float3 vP = GetViewPosFromLinDepth( float2(x, y), z); bAllInvisib = bAllInvisib && dot(plane, float4(vP,1.0))>0; } if(bAllInvisib) bIsHit = false; } #endif } return bIsHit; } bool CheckIntersectionBasic(int l, int k) { unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff; return ((val>>0)&0xff)<=((uint) k) && ((uint) k)<=((val>>8)&0xff); } [numthreads(NR_THREADS, 1, 1)] void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint2 tileIDX = u3GroupID.xy; uint t=threadID; uint iWidth; uint iHeight; #ifdef MSAA_ENABLED uint iNumSamplesMSAA; g_depth_tex.GetDimensions(iWidth, iHeight, iNumSamplesMSAA); #else g_depth_tex.GetDimensions(iWidth, iHeight); #endif uint nrTilesX = (iWidth+15)/16; uint nrTilesY = (iHeight+15)/16; uint2 viTilLL = 16*tileIDX; uint2 viTilUR = min( viTilLL+uint2(16,16), uint2(iWidth-1, iHeight-1) ); if(t==0) { lightOffs = 0; #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE ldsZMax = 0; #endif } #if !defined(XBONE) && !defined(PLAYSTATION4) GroupMemoryBarrierWithGroupSync(); #endif float dpt_ma=1.0; #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE // establish min and max depth first dpt_ma=0.0; for(int idx=t; idx<256; idx+=NR_THREADS) { uint2 uPixCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) ); #ifdef MSAA_ENABLED for(int i=0; ivTileLL.xy) && all(vMi.xyzvTileLL.xy) && all(vMi.xy0.0 ? ((g_fFarPlane-g_fNearPlane)/fDenom) : 1.0; // readjust to new range. 
/* Next physical line: remainder of LIGHTLISTGEN (cluster index packing, plane fetch), tail of sortLightList, start of GetPlaneEq. NOTE(review): the per-cluster output loop and the head of sortLightList are missing. */
#else float fTileFarPlane = g_fFarPlane; float fModulUserScale = 1.0; #endif #ifdef EXACT_EDGE_TESTS iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane); #endif // sort lights #if !defined(XBONE) && !defined(PLAYSTATION4) sortLightList((int) t, iNrCoarseLights); #endif //////////// cell specific code { for(int l=(int) t; l<((iNrCoarseLights+1)>>1); l += NR_THREADS) { const int l0 = coarseList[2*l+0], l1 = coarseList[min(2*l+1,iNrCoarseLights)]; const unsigned int clustIdxMi0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0].z), fModulUserScale)); const unsigned int clustIdxMa0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0+g_iNrVisibLights].z), fModulUserScale)); const unsigned int clustIdxMi1 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1].z), fModulUserScale)); const unsigned int clustIdxMa1 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1+g_iNrVisibLights].z), fModulUserScale)); clusterIdxs[l] = (clustIdxMa1<<24) | (clustIdxMi1<<16) | (clustIdxMa0<<8) | (clustIdxMi0<<0); } } #if !defined(XBONE) && !defined(PLAYSTATION4) GroupMemoryBarrierWithGroupSync(); #endif int nrClusters = (1<>2; int m = i&3; if(i<24) lightPlanes[6*m+p] = FetchPlane(min(iNrCoarseLights-1,ll+m), p); #if !defined(XBONE) && !defined(PLAYSTATION4) GroupMemoryBarrierWithGroupSync(); #endif for(int l=ll; l>1; j>0; j=j>>1) { for(int i=localThreadID; ii) { const unsigned int Avalue = coarseList[i]; const unsigned int Bvalue = coarseList[ixj]; const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue; if(mustSwap) { coarseList[i]=Bvalue; coarseList[ixj]=Avalue; } } } GroupMemoryBarrierWithGroupSync(); } } } float4 GetPlaneEq(const float3 vBoxX, const float3 vBoxY, const float3 vBoxZ, const float3 vCen, const float2 vScaleXY, const int sideIndex) { const int iAbsSide = (sideIndex == 0 || sideIndex == 1) ? 
/* Next physical line: rest of GetPlaneEq, FetchPlane, start of SphericalIntersectionTests (NOTE(review): its loop header and the start of the loop body are missing). */
0 : ((sideIndex == 2 || sideIndex == 3) ? 1 : 2); const float fS = (sideIndex & 1) != 0 ? 1 : (-1); float3 vA = fS*(iAbsSide == 0 ? vBoxX : (iAbsSide == 1 ? (-vBoxY) : vBoxZ)); float3 vB = fS*(iAbsSide == 0 ? (-vBoxY) : (iAbsSide == 1 ? (-vBoxX) : (-vBoxY))); float3 vC = iAbsSide == 0 ? vBoxZ : (iAbsSide == 1 ? vBoxZ : (-vBoxX)); bool bIsTopQuad = iAbsSide == 2 && (sideIndex & 1) != 0; // in this case all 4 verts get scaled. bool bIsSideQuad = (iAbsSide == 0 || iAbsSide == 1); // if side quad only two verts get scaled (impacts q1 and q2) if (bIsTopQuad) { vB *= vScaleXY.y; vC *= vScaleXY.x; } float3 vA2 = vA; float3 vB2 = vB; if (bIsSideQuad) { vA2 *= (iAbsSide == 0 ? vScaleXY.x : vScaleXY.y); vB2 *= (iAbsSide == 0 ? vScaleXY.y : vScaleXY.x); } float3 p0 = vCen + (vA + vB - vC); // vCen + vA is center of face when vScaleXY is 1.0 float3 vN = cross( vB2, 0.5*(vA-vA2) - vC ); #ifdef LEFT_HAND_COORDINATES vN = -vN; #endif return float4(vN, -dot(vN,p0)); } float4 FetchPlane(int l, int p) { SFiniteLightBound lgtDat = g_data[coarseList[l]]; const float3 vBoxX = lgtDat.vBoxAxisX.xyz; const float3 vBoxY = lgtDat.vBoxAxisY.xyz; const float3 vBoxZ = -lgtDat.vBoxAxisZ.xyz; // flip an axis to make it right handed since Determinant(worldToView)<0 const float3 vCen = lgtDat.vCen.xyz; const float fRadius = lgtDat.fRadius; const float2 vScaleXY = lgtDat.vScaleXY; return GetPlaneEq(vBoxX, vBoxY, vBoxZ, vCen, vScaleXY, p); } #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate) { #ifdef LEFT_HAND_COORDINATES float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0); #else float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0); #endif float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne(); float worldDistAtDepthOne = 8*onePixDiagDist; // scale by half a tile int iNrVisib = 0; for(int l=threadID; l0.0001 ? 
/* Next physical line: rest of SphericalIntersectionTests, then the EXACT_EDGE_TESTS helpers and the start of CullByExactEdgeTests (NOTE(review): heavily damaged; several function heads are missing). */
(maxZdir.z/len) : len; // since len>=(maxZdir.z/len) we can use len as an approximate value when len<=epsilon float fOffs = scalarProj*fRad; #else float fOffs = fRad; // more false positives due to larger radius but works too #endif #ifdef LEFT_HAND_COORDINATES fRad = fRad + (vCen.z+fOffs)*worldDistAtDepthOne; #else fRad = fRad + (vCen.z-fOffs)*worldDistAtDepthOne; #endif float a = dot(V,V); float CdotV = dot(vCen,V); float c = dot(vCen,vCen) - fRad*fRad; float fDescDivFour = CdotV*CdotV - a*c; if(!(c<0 || (fDescDivFour>0 && CdotV>0))) // if ray hit bounding sphere coarseList[l]=0xffffffff; } #if !defined(XBONE) && !defined(PLAYSTATION4) GroupMemoryBarrierWithGroupSync(); #endif // to greedy to double buffer coarseList lds on this so serializing removal of gaps. if(threadID==0) { int offs = 0; for(int l=0; l>2; int iSwizzle = e0&0x3; bool bIsSwizzleOneOrTwo = ((iSwizzle-1)&0x2)==0; const int i0 = iAxis==0 ? (2*iSwizzle+0) : ( iAxis==1 ? (iSwizzle+(iSwizzle&2)) : iSwizzle); const int i1 = i0 + (1<>2; // section 0 is side edges, section 1 is near edges and section 2 is far edges int iSwizzle = e0&0x3; int i=iSwizzle + (2*(iSection&0x2)); // offset by 4 at section 2 vP0 = GetTileVertex(uint2(viTilLL.x, viTilUR.y), uint2(viTilUR.x, viTilLL.y), i, fTileFarPlane); vE0 = iSection==0 ? vP0 : (((iSwizzle&0x2)==0 ? 1.0f : (-1.0f))*((iSwizzle&0x1)==(iSwizzle>>1) ? Vec3(1,0,0) : Vec3(0,1,0))); } int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane) { if(threadID==0) lightOffs2 = 0; const bool bOnlyNeedFrustumSideEdges = true; const int nrFrustEdges = bOnlyNeedFrustumSideEdges ? 4 : 8; // max 8 since we never need to test 4 far edges of frustum since they are identical vectors to near edges and plane is placed at vP0 on light hull. const int totNrEdgePairs = 12*nrFrustEdges; for(int l=0; l0) ++positive; else if(fSignDist<0) ++negative; } int resh = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? 
/* Next physical line: tail of CullByExactEdgeTests, then the ClearAtomic kernel. */
(-1) : 0)); positive=0; negative=0; for(int j=0; j<8; j++) { float3 vPf = GetTileVertex(viTilLL, viTilUR, j, fTileFarPlane); float fSignDist = dot(vN, vPf-vP0); if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative; } int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0)); bool bFoundSepPlane = (resh*resf)<0; if(bFoundSepPlane) InterlockedOr(ldsIsLightInvisible, 1); } } #if !defined(XBONE) && !defined(PLAYSTATION4) GroupMemoryBarrierWithGroupSync(); #endif if(threadID==0 && ldsIsLightInvisible==0) { coarseList[lightOffs2++] = coarseList[l]; } } #if !defined(XBONE) && !defined(PLAYSTATION4) GroupMemoryBarrierWithGroupSync(); #endif return lightOffs2; } #endif [numthreads(1, 1, 1)] void ClearAtomic(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { g_LayeredSingleIdxBuffer[0]=0; }