// -----------------------------------------------------------------------------------------------
// Overview (review notes; every statement below refers to code visible in this file)
//
// Compute kernel ScreenBoundsAABB writes one min/max bound per light into g_vBoundsBuffer:
//   g_vBoundsBuffer[lgtIndex]                    = (0.5*vMin.xy+0.5, vMin.z*VIEWPORT_SCALE_Z)
//   g_vBoundsBuffer[lgtIndex + g_iNrVisibLights] = (0.5*vMax.xy+0.5, vMax.z*VIEWPORT_SCALE_Z)
//
// Thread mapping: NR_THREADS (64) threads per group, 8 threads per light, 8 lights per group.
//   subLigt   = t/8          light slot inside the group
//   lgtIndex  = subLigt+g*8  index into g_data / g_vBoundsBuffer
//   sideIndex = t%8          only values 0..5 are used (one per hull quad from GetHullQuad)
//
// Group-shared storage: posX/posY/posZ/posW give each light slot MAX_PNTS*2 entries starting at
// subLigt*MAX_PNTS*2. The clipping path uses them as two buffers of MAX_PNTS vertices that are
// swapped after every plane (iSrcIndex = 1-iSrcIndex). The "all visible" path reads entries
// sideIndex+0 and sideIndex+6 of the same region as the per-face min and max.
//
// Helpers defined in this file:
//   GetClip(P)                  bit mask of failed clip tests; visible part sets 4 for y<-w,
//                               8 for y>w, 16 for z<0, 32 for z>w.
//   ClipAgainstPlane(...)       clips the polygon in the source buffer against plane p, reading at
//                               offs_src and targeting offs_dst; the caller uses the return value
//                               as the new vertex count.
//   GenNewVert(vVisib,vInvisib,p) vertex on the edge between a visible and an invisible vertex for
//                               plane p: component p/2 (x, y or z) is tested against fS*w, where
//                               fS is 0 for p==4, -1 for other even p and +1 for odd p. The
//                               clipped component is then overwritten with fS*vNew.w.
//   TransformPlaneToPostSpace   returns mul(plane, InvProjection).
//   EvalPlanePair / CalcBound   CalcBound builds one plane pair from (x,z) and one from (y,z) of the
//                               view-space position, transforms them with InvProjection and
//                               outputs vMin = B, vMax = A plus per-axis validity flags.
//
// NOTE(review): this text appears damaged and should be restored from the upstream file before it
// is compiled or edited further:
//   * line breaks are missing, so each `//` comment runs on into the code that follows it on the
//     same physical line (including the #pragma/#include/#define directives);
//   * several spans are absent, presumably text between a `<` and a later `>` that was stripped:
//       - `StructuredBuffer g_data` and `RWStructuredBuffer g_vBoundsBuffer` have no element type
//       - `fW = fS * (fWabs>(i*6))&0x3f;`       (rest of the per-quad AABB code and the flag loop)
//       - `for(int k=0; kradius)`               (bound accumulation after clipping)
//       - `unsigned int uMask = (1<P.w)?2:0)`   (body of ClipAgainstPlane and head of GetClip)
//       - `abs(posXY_in.y)0.0;`                 (body of EvalPlanePair)
//   Nothing is documented here about the missing parts; do not infer their behaviour from names.
// -----------------------------------------------------------------------------------------------
// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7. // https://github.com/wolfgangfengel/GPU-Pro-7 #pragma kernel ScreenBoundsAABB #include "ShaderBase.h" #include "LightDefinitions.cs.hlsl" uniform int g_isOrthographic; uniform int g_iNrVisibLights; uniform float4x4 g_mInvProjection; uniform float4x4 g_mProjection; StructuredBuffer g_data : register( t0 ); #define FLT_EPSILON 1.192092896e-07F // smallest such that 1.0+FLT_EPSILON != 1.0 #define NR_THREADS 64 // output buffer RWStructuredBuffer g_vBoundsBuffer : register( u0 ); #define MAX_PNTS 9 // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed) // However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane // clipping gets skipped which doesn't cause any errors. // LDS (2496 bytes) groupshared float posX[MAX_PNTS*8*2]; groupshared float posY[MAX_PNTS*8*2]; groupshared float posZ[MAX_PNTS*8*2]; groupshared float posW[MAX_PNTS*8*2]; groupshared unsigned int clipFlags[48]; unsigned int GetClip(const float4 P); int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p); void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r); #include "LightingConvexHullUtils.hlsl" [numthreads(NR_THREADS, 1, 1)] void ScreenBoundsAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID) { uint groupID = u3GroupID.x; //uint vindex = groupID * NR_THREADS + threadID; unsigned int g = groupID; unsigned int t = threadID; const int subLigt = (int) (t/8); const int lgtIndex = subLigt+(int) g*8; const int sideIndex = (int) (t%8); SFiniteLightBound lgtDat = g_data[lgtIndex]; const float3 boxX = lgtDat.boxAxisX.xyz; const float3 boxY = lgtDat.boxAxisY.xyz; const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a 
spot-light) const float3 center = lgtDat.center.xyz; const float radius = lgtDat.radius; const float2 scaleXY = lgtDat.scaleXY; { if(sideIndex<6 && lgtIndex<(int) g_iNrVisibLights) // mask 2 out of 8 threads { float3 q0, q1, q2, q3; GetHullQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, sideIndex); const float4 vP0 = mul(g_mProjection, float4(q0, 1)); const float4 vP1 = mul(g_mProjection, float4(q1, 1)); const float4 vP2 = mul(g_mProjection, float4(q2, 1)); const float4 vP3 = mul(g_mProjection, float4(q3, 1)); // test vertices of one quad (of the convex hull) for intersection const unsigned int uFlag0 = GetClip(vP0); const unsigned int uFlag1 = GetClip(vP1); const unsigned int uFlag2 = GetClip(vP2); const unsigned int uFlag3 = GetClip(vP3); const float4 vPnts[] = {vP0, vP1, vP2, vP3}; // screen-space AABB of one quad (assuming no intersection) float3 vMin, vMax; for(int k=0; k<4; k++) { float fW = vPnts[k].w; float fS = fW<0 ? -1 : 1; float fWabs = fW<0 ? (-fW) : fW; fW = fS * (fWabs>(i*6))&0x3f; uFlagAnd &= uClipBits; uFlagOr |= uClipBits; } uCollectiveAnd &= uFlagAnd; uCollectiveOr |= uFlagOr; } bool bSetBoundYet = false; float3 vMin=0.0, vMax=0.0; if(uCollectiveAnd!=0 || uCollectiveOr==0) // all invisible or all visible (early out) { if(uCollectiveOr==0) // all visible { for(f=0; f<6; f++) { const int sideIndex = f; float3 vFaceMi = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 0], posY[subLigt*MAX_PNTS*2 + sideIndex + 0], posZ[subLigt*MAX_PNTS*2 + sideIndex + 0]); float3 vFaceMa = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 6], posY[subLigt*MAX_PNTS*2 + sideIndex + 6], posZ[subLigt*MAX_PNTS*2 + sideIndex + 6]); for(int k=0; k<2; k++) { float3 vP = k==0 ? 
vFaceMi : vFaceMa; if(f==0 && k==0) { vMin=vP; vMax=vP; } vMax = max(vMax, vP); vMin = min(vMin, vP); } } bSetBoundYet=true; } } else // :( need true clipping { for(f=0; f<6; f++) { float3 q0, q1, q2, q3; GetHullQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, f); // 4 vertices to a quad of the convex hull in post projection space const float4 vP0 = mul(g_mProjection, float4(q0, 1)); const float4 vP1 = mul(g_mProjection, float4(q1, 1)); const float4 vP2 = mul(g_mProjection, float4(q2, 1)); const float4 vP3 = mul(g_mProjection, float4(q3, 1)); int iSrcIndex = 0; int offs = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2; // fill up source clip buffer with the quad posX[offs+0]=vP0.x; posX[offs+1]=vP1.x; posX[offs+2]=vP2.x; posX[offs+3]=vP3.x; posY[offs+0]=vP0.y; posY[offs+1]=vP1.y; posY[offs+2]=vP2.y; posY[offs+3]=vP3.y; posZ[offs+0]=vP0.z; posZ[offs+1]=vP1.z; posZ[offs+2]=vP2.z; posZ[offs+3]=vP3.z; posW[offs+0]=vP0.w; posW[offs+1]=vP1.w; posW[offs+2]=vP2.w; posW[offs+3]=vP3.w; int iNrSrcVerts = 4; // do true clipping for(int p=0; p<6; p++) { const int nrVertsDst = ClipAgainstPlane(iSrcIndex, iNrSrcVerts, subLigt, p); iSrcIndex = 1-iSrcIndex; iNrSrcVerts = nrVertsDst; if(iNrSrcVerts<3 || iNrSrcVerts>=MAX_PNTS) break; } // final clipped convex primitive is in src buffer if(iNrSrcVerts>2) { int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2; for(int k=0; kradius) { float2 vMi, vMa; bool2 bMi, bMa; CalcBound(bMi, bMa, vMi, vMa, g_mInvProjection, center, radius); vMin.xy = bMi ? max(vMin.xy, vMi) : vMin.xy; vMax.xy = bMa ? 
min(vMax.xy, vMa) : vMax.xy; } else if(g_isOrthographic!=0) { float2 vMi = mul(g_mProjection, float4(center.xyz-radius,1)).xy; // no division needed for ortho float2 vMa = mul(g_mProjection, float4(center.xyz+radius,1)).xy; // no division needed for ortho vMin.xy = max(vMin.xy, vMi); vMax.xy = min(vMax.xy, vMa); } #if USE_LEFTHAND_CAMERASPACE if((center.z-radius)>0.0) { float4 vPosF = mul(g_mProjection, float4(0,0,center.z-radius,1)); vMin.z = max(vMin.z, vPosF.z/vPosF.w); } if((center.z+radius)>0.0) { float4 vPosB = mul(g_mProjection, float4(0,0,center.z+radius,1)); vMax.z = min(vMax.z, vPosB.z/vPosB.w); } #else if((center.z+radius)<0.0) { float4 vPosF = mul(g_mProjection, float4(0,0,center.z+radius,1)); vMin.z = max(vMin.z, vPosF.z/vPosF.w); } if((center.z-radius)<0.0) { float4 vPosB = mul(g_mProjection, float4(0,0,center.z-radius,1)); vMax.z = min(vMax.z, vPosB.z/vPosB.w); } #endif else { vMin = float3(-3,-3,-3); vMax = float3(-2,-2,-2); } } // we should consider doing a look-up here into a max depth mip chain // to see if the light is occluded: vMin.z*VIEWPORT_SCALE_Z > MipTexelMaxDepth //g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, -0.5*vMax.y+0.5, vMin.z*VIEWPORT_SCALE_Z); //g_vBoundsBuffer[lgtIndex+g_iNrVisibLights] = float3(0.5*vMax.x+0.5, -0.5*vMin.y+0.5, vMax.z*VIEWPORT_SCALE_Z); // changed for unity g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, 0.5*vMin.y+0.5, vMin.z*VIEWPORT_SCALE_Z); g_vBoundsBuffer[lgtIndex+(int) g_iNrVisibLights] = float3(0.5*vMax.x+0.5, 0.5*vMax.y+0.5, vMax.z*VIEWPORT_SCALE_Z); } } } float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p); int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p) { int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2; int offs_dst = (1-iSrcIndex)*MAX_PNTS+subLigt*MAX_PNTS*2; float4 vPrev = float4(posX[offs_src+(iNrSrcVerts-1)], posY[offs_src+(iNrSrcVerts-1)], posZ[offs_src+(iNrSrcVerts-1)], posW[offs_src+(iNrSrcVerts-1)]); int 
nrVertsDst = 0; unsigned int uMask = (1<P.w)?2:0) | ((P.y<-P.w)?4:0) | ((P.y>P.w)?8:0) | ((P.z<0)?16:0) | ((P.z>P.w)?32:0); } float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p) { const float fS = p==4 ? 0 : ((p&1)==0 ? -1 : 1); const int index = ((uint) p)/2; float x1 = index==0 ? vVisib.x : (index==1 ? vVisib.y : vVisib.z); float x0 = index==0 ? vInvisib.x : (index==1 ? vInvisib.y : vInvisib.z); //fS*((vVisib.w-vInvisib.w)*t + vInvisib.w) = (x1-x0)*t + x0; const float fT = (fS*vInvisib.w-x0)/((x1-x0) - fS*(vVisib.w-vInvisib.w)); float4 vNew = vVisib*fT + vInvisib*(1-fT); // just to be really anal we make sure the clipped against coordinate is precise if(index==0) vNew.x = fS*vNew.w; else if(index==1) vNew.y = fS*vNew.w; else vNew.z = fS*vNew.w; return vNew; } float4 TransformPlaneToPostSpace(float4x4 InvProjection, float4 plane) { return mul(plane, InvProjection); } float4 EvalPlanePair(out bool validPlanes, float2 posXY_in, float r) { // rotate by 90 degrees to avoid potential division by zero bool bMustFlip = abs(posXY_in.y)0.0; return res; } void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r) { bool validX, validY; float4 planeX = EvalPlanePair(validX, float2(pos_view_space.x, pos_view_space.z), r); float4 planeY = EvalPlanePair(validY, float2(pos_view_space.y, pos_view_space.z), r); #if USE_LEFTHAND_CAMERASPACE planeX = planeX.zwxy; // need to swap left/right and top/bottom planes when using left hand system planeY = planeY.zwxy; #endif bIsMinValid = bool2(planeX.z<0, planeY.z<0) && bool2(validX,validY); bIsMaxValid = bool2((-planeX.x)<0, (-planeY.x)<0) && bool2(validX,validY); // hopefully the compiler takes zeros into account // should be the case since the transformation in TransformPlaneToPostSpace() // is done using multiply-adds and not dot product instructions. 
float4 planeX0 = TransformPlaneToPostSpace(InvProjection, float4(planeX.x, 0, planeX.y, 0)); float4 planeX1 = TransformPlaneToPostSpace(InvProjection, float4(planeX.z, 0, planeX.w, 0)); float4 planeY0 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.x, planeY.y, 0)); float4 planeY1 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.z, planeY.w, 0)); // convert planes to the forms (1,0,0,D) and (0,1,0,D) // 2D bound is given by -D components float2 A = -float2(planeX0.w / planeX0.x, planeY0.w / planeY0.y); float2 B = -float2(planeX1.w / planeX1.x, planeY1.w / planeY1.y); // Bound is complete vMin = B; vMax = A; }