ScriptableRenderPipeline/TestbedPipelines/Fptl/scrbound.compute


								// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.

								// https://github.com/wolfgangfengel/GPU-Pro-7


								#pragma kernel ScreenBoundsAABB


								#include "ShaderBase.h"

								#include "LightDefinitions.cs.hlsl"


								// Nonzero selects the orthographic path when the kernel tightens the bounds with the bounding sphere.
								uniform int g_isOrthographic;

								// Number of lights to process. Threads whose light index is not below this value do no work.
								// It is also the offset of the max corners inside g_vBoundsBuffer.
								uniform int g_iNrVisibLights;

								// Used to map the camera frustum corners from post-projection space to view space,
								// and passed on to CalcBound().
								uniform float4x4 g_mInvProjection;

								// Applied to view-space positions as mul(g_mProjection, float4(pos, 1)).
								uniform float4x4 g_mProjection;


								// Per-light bounding volume data (box axes, center, radius, scaleXY), indexed by light index.
								StructuredBuffer<SFiniteLightBound> g_data : register( t0 );


								#define FLT_EPSILON     1.192092896e-07F        // smallest such that 1.0+FLT_EPSILON != 1.0

								// Threads per group. The kernel uses 8 threads per light, i.e. 8 lights per group.
								#define NR_THREADS			64


								// output buffer: min corner of light i at [i], max corner at [i + g_iNrVisibLights]

								RWStructuredBuffer<float3> g_vBoundsBuffer : register( u0 );


								// Capacity, in vertices, of one clip buffer.
								// Strictly this should be 10 = 6+4, but with 9 we get more wavefronts and 10 never seems to be hit in practice.
								// In the worst case the only step that gets skipped is clipping against the back plane,
								// which does not cause any errors.
								#define MAX_PNTS		9





								// LDS (2496 bytes): 4 arrays * (MAX_PNTS*8*2) floats * 4 bytes + 48 uints * 4 bytes
								// Each of the 8 lights in a group owns a region of 2*MAX_PNTS entries (two clip buffers of MAX_PNTS vertices).

								groupshared float posX[MAX_PNTS*8*2];

								groupshared float posY[MAX_PNTS*8*2];

								groupshared float posZ[MAX_PNTS*8*2];

								groupshared float posW[MAX_PNTS*8*2];

								// 8 lights * 6 faces; each entry packs the four 6-bit GetClip() codes of one face.
								groupshared unsigned int clipFlags[48];


								// Forward declarations; the definitions follow the kernel further down in this file.
								unsigned int GetClip(const float4 P);

								int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p);

								void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r);


								#include "LightingConvexHullUtils.hlsl"


								// Kernel: computes a screen-space AABB (min and max corner) for every light and writes it to g_vBoundsBuffer.
								//
								// Thread layout: a group of NR_THREADS (64) threads handles 8 lights with 8 threads per light.
								//   Phase 1 (before the barrier): threads 0..5 of each light each process one of the 6 quads of the
								//           light's bounding hull and store clip flags and a per-quad AABB in LDS.
								//   Phase 2 (after the barrier): thread 0 of each light merges the per-quad results, or performs true
								//           clipping against the frustum when the hull is only partially visible, then tightens the
								//           result with the bounding sphere and writes the output.
								//
								// Output: min corner at g_vBoundsBuffer[lgtIndex], max corner at g_vBoundsBuffer[lgtIndex + g_iNrVisibLights];
								//         x and y are remapped with 0.5*v+0.5 and z is multiplied by VIEWPORT_SCALE_Z.
								//
								// GetQuad(), GetPlane() and VIEWPORT_SCALE_Z are not defined in this file (presumably they come from the includes).
								[numthreads(NR_THREADS, 1, 1)]

								void ScreenBoundsAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)

								{

									uint groupID = u3GroupID.x;


									//uint vindex = groupID * NR_THREADS + threadID;

									unsigned int g = groupID;

									unsigned int t = threadID;


									// subLigt: light slot within the group (0..7), lgtIndex: global light index,
									// sideIndex: thread slot within the light (0..7, only 0..5 map to a hull face)
									const int subLigt = (int) (t/8);

									const int lgtIndex = subLigt+(int) g*8;

									const int sideIndex = (int) (t%8);


									// note: this read is not guarded by lgtIndex<g_iNrVisibLights, but every use of the values below is
									SFiniteLightBound lgtDat = g_data[lgtIndex];


									const float3 boxX = lgtDat.boxAxisX.xyz;

									const float3 boxY = lgtDat.boxAxisY.xyz;

									const float3 boxZ = -lgtDat.boxAxisZ.xyz;           // flip axis (so it points away from the light direction for a spot-light)

									const float3 center = lgtDat.center.xyz;

									const float radius = lgtDat.radius;

									const float2 scaleXY = lgtDat.scaleXY;


									// ---- phase 1: one thread per hull face ----
									{

										if(sideIndex<6 && lgtIndex<(int) g_iNrVisibLights)		// mask 2 out of 8 threads

										{

											float3 q0, q1, q2, q3;

											GetQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, sideIndex);


											const float4 vP0 = mul(g_mProjection, float4(q0, 1));

											const float4 vP1 = mul(g_mProjection, float4(q1, 1));

											const float4 vP2 = mul(g_mProjection, float4(q2, 1));

											const float4 vP3 = mul(g_mProjection, float4(q3, 1));


											// test vertices of one quad (of the convex hull) for intersection

											const unsigned int uFlag0 = GetClip(vP0);

											const unsigned int uFlag1 = GetClip(vP1);

											const unsigned int uFlag2 = GetClip(vP2);

											const unsigned int uFlag3 = GetClip(vP3);


											const float4 vPnts[] = {vP0, vP1, vP2, vP3};


											// screen-space AABB of one quad (assuming no intersection)

											float3 vMin, vMax;

											for(int k=0; k<4; k++)

											{

												// keep the sign of w but clamp its magnitude to at least FLT_EPSILON before dividing
												float fW = vPnts[k].w;

												float fS = fW<0 ? -1 : 1;

												float fWabs = fW<0 ? (-fW) : fW;

												fW = fS * (fWabs<FLT_EPSILON ? FLT_EPSILON : fWabs);

												float3 vP = float3(vPnts[k].x/fW, vPnts[k].y/fW, vPnts[k].z/fW);

												if(k==0) { vMin=vP; vMax=vP; }


												vMax = max(vMax, vP); vMin = min(vMin, vP);

											}


											// pack the four 6-bit clip codes of this quad into one word
											clipFlags[subLigt*6+sideIndex] = (uFlag0<<0) | (uFlag1<<6) | (uFlag2<<12) | (uFlag3<<18);


											// store in clip buffer (only use these vMin and vMax if light is 100% visible in which case clipping isn't needed)
											// per-face mins go to slots 0..5 and per-face maxes to slots 6..11 of this light's LDS region

											posX[subLigt*MAX_PNTS*2 + sideIndex] = vMin.x;

											posY[subLigt*MAX_PNTS*2 + sideIndex] = vMin.y;

											posZ[subLigt*MAX_PNTS*2 + sideIndex] = vMin.z;


											posX[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.x;

											posY[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.y;

											posZ[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.z;

										}

									}


									// if not XBONE and not PLAYSTATION4 we need a memorybarrier here

									// since we can't rely on the gpu cores being 64 wide.

									// We need a pound define around this.

									GroupMemoryBarrierWithGroupSync();


									// ---- phase 2: one thread per light ----
									{

										int f=0;


										if(sideIndex==0 && lgtIndex<(int) g_iNrVisibLights)

										{

											// quick acceptance or rejection
											// uCollectiveAnd ends up nonzero when all 24 hull vertices are outside one common clip plane;
											// uCollectiveOr ends up zero when no vertex is outside any clip plane.

											unsigned int uCollectiveAnd = (unsigned int) -1;

											unsigned int uCollectiveOr = 0;

											for(f=0; f<6; f++)

											{

												unsigned int uFlagAnd = clipFlags[subLigt*6+f]&0x3f;

												unsigned int uFlagOr = uFlagAnd;

												for(int i=1; i<4; i++)

												{

													unsigned int uClipBits = (clipFlags[subLigt*6+f]>>(i*6))&0x3f;

													uFlagAnd &= uClipBits;

													uFlagOr |= uClipBits;

												}


												uCollectiveAnd &= uFlagAnd;

												uCollectiveOr |= uFlagOr;

											}


											bool bSetBoundYet = false;

											float3 vMin=0.0, vMax=0.0;

											if(uCollectiveAnd!=0 || uCollectiveOr==0)		// all invisible or all visible (early out)

											{

												// in the "all invisible" case nothing is done here: bSetBoundYet stays false and the
												// off-screen box is assigned further down
												if(uCollectiveOr==0)	// all visible

												{

													for(f=0; f<6; f++)

													{

														// note: this local shadows the outer sideIndex
														const int sideIndex = f;


														float3 vFaceMi = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 0], posY[subLigt*MAX_PNTS*2 + sideIndex + 0], posZ[subLigt*MAX_PNTS*2 + sideIndex + 0]);

														float3 vFaceMa = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 6], posY[subLigt*MAX_PNTS*2 + sideIndex + 6], posZ[subLigt*MAX_PNTS*2 + sideIndex + 6]);


														for(int k=0; k<2; k++)

														{

															float3 vP = k==0 ? vFaceMi : vFaceMa;

															if(f==0 && k==0) { vMin=vP; vMax=vP; }


															vMax = max(vMax, vP); vMin = min(vMin, vP);

														}

													}

													bSetBoundYet=true;

												}

											}

											else		// :( need true clipping

											{


												for(f=0; f<6; f++)

												{

													float3 q0, q1, q2, q3;

													GetQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, f);


													// 4 vertices to a quad of the convex hull in post projection space

													const float4 vP0 = mul(g_mProjection, float4(q0, 1));

													const float4 vP1 = mul(g_mProjection, float4(q1, 1));

													const float4 vP2 = mul(g_mProjection, float4(q2, 1));

													const float4 vP3 = mul(g_mProjection, float4(q3, 1));


													// iSrcIndex selects which of this light's two clip buffers is the current source (starts at 0)
													int iSrcIndex = 0;


													int offs = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;


													// fill up source clip buffer with the quad
													// (this overwrites the per-face min/max values stored in phase 1)

													posX[offs+0]=vP0.x; posX[offs+1]=vP1.x; posX[offs+2]=vP2.x; posX[offs+3]=vP3.x;

													posY[offs+0]=vP0.y; posY[offs+1]=vP1.y; posY[offs+2]=vP2.y; posY[offs+3]=vP3.y;

													posZ[offs+0]=vP0.z; posZ[offs+1]=vP1.z; posZ[offs+2]=vP2.z; posZ[offs+3]=vP3.z;

													posW[offs+0]=vP0.w; posW[offs+1]=vP1.w; posW[offs+2]=vP2.w; posW[offs+3]=vP3.w;


													int iNrSrcVerts = 4;


													// do true clipping
													// each pass reads the source buffer, writes the other one, and the two then swap roles

													for(int p=0; p<6; p++)

													{

														const int nrVertsDst = ClipAgainstPlane(iSrcIndex, iNrSrcVerts, subLigt, p);


														iSrcIndex = 1-iSrcIndex;

														iNrSrcVerts = nrVertsDst;


														// stop when fewer than 3 vertices remain or the buffer capacity has been reached
														if(iNrSrcVerts<3 || iNrSrcVerts>=MAX_PNTS) break;

													}


													// final clipped convex primitive is in src buffer

													if(iNrSrcVerts>2)

													{

														int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;

														for(int k=0; k<iNrSrcVerts; k++)

														{

															float4 vCur = float4(posX[offs_src+k], posY[offs_src+k], posZ[offs_src+k], posW[offs_src+k]);


															// project and apply toward AABB
															// (unlike phase 1, w is not clamped away from zero here)

															float3 vP = float3(vCur.x/vCur.w, vCur.y/vCur.w, vCur.z/vCur.w);

															if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }


															vMax = max(vMax, vP); vMin = min(vMin, vP);

														}

													}


												}


												////////////////////// look for camera frustum verts that need to be included. That is frustum vertices inside the convex hull for the light

												int i=0;

												for(i=0; i<8; i++)	// establish 8 camera frustum vertices

												{

													// corner i in post-projection space: x and y in {-1, 1}, z in {0, 1}
													float3 vVertPSpace = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);


													float4 v4ViewSpace = mul(g_mInvProjection, float4(vVertPSpace,1));

													float3 vViewSpace = float3(v4ViewSpace.x/v4ViewSpace.w, v4ViewSpace.y/v4ViewSpace.w, v4ViewSpace.z/v4ViewSpace.w);


													// the clip loop above has finished, so the start of this light's LDS region is reused
													posX[subLigt*MAX_PNTS*2 + i] = vViewSpace.x;

													posY[subLigt*MAX_PNTS*2 + i] = vViewSpace.y;

													posZ[subLigt*MAX_PNTS*2 + i] = vViewSpace.z;

												}


												// determine which camera frustum vertices are inside the convex hull
												// bit i of uVisibFl survives only if corner i has dot(corner-vP0, vN)<0 for all 6 hull planes

												uint uVisibFl = 0xff;

												for(f=0; f<6; f++)

												{

													float3 vP0, vN;

													GetPlane(vP0, vN, boxX, boxY, boxZ, center, scaleXY, f);


													for(i=0; i<8; i++)

													{

														float3 vViewSpace = float3(posX[subLigt*MAX_PNTS*2 + i], posY[subLigt*MAX_PNTS*2 + i], posZ[subLigt*MAX_PNTS*2 + i]);

														uVisibFl &= ( dot(vViewSpace-vP0, vN)<0 ? 0xff : (~(1<<i)) );

													}

												}


												// apply camera frustum vertices inside the convex hull to the AABB

												for(i=0; i<8; i++)

												{

													if((uVisibFl&(1<<i))!=0)

													{

														// same post-projection corner as constructed in the loop above
														float3 vP = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);


														if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }


														vMax = max(vMax, vP); vMin = min(vMin, vP);

													}

												}

											}


											// determine AABB bound in [-1;1]x[-1;1] screen space using bounding sphere.

											// Use the result to make our already established AABB from the convex hull

											// potentially tighter.

											if(!bSetBoundYet)

											{

												// set the AABB off-screen

												vMin = float3(-3,-3,-3);

												vMax = float3(-2,-2,-2);

											}

											else

											{

												// perspective path; only taken when the view-space origin lies outside the bounding sphere
												if(g_isOrthographic==0 && length(center)>radius)

												{

													float2 vMi, vMa;

													bool2 bMi, bMa;

													CalcBound(bMi, bMa, vMi, vMa, g_mInvProjection, center, radius);


													// per component: only apply a sphere bound that CalcBound reported as valid
													vMin.xy = bMi ? max(vMin.xy, vMi) : vMin.xy;

													vMax.xy = bMa ? min(vMax.xy, vMa) : vMax.xy;

												}

												else if(g_isOrthographic!=0)

												{

													// radius is subtracted from / added to all three components of center before projecting
													float2 vMi = mul(g_mProjection, float4(center.xyz-radius,1)).xy;	 // no division needed for ortho

													float2 vMa = mul(g_mProjection, float4(center.xyz+radius,1)).xy;	 // no division needed for ortho

													vMin.xy = max(vMin.xy, vMi);

													vMax.xy = min(vMax.xy, vMa);

												}


												// tighten the depth range with the sphere's extent along z
								#if USE_LEFTHAND_CAMERASPACE

												if((center.z-radius)>0.0)

												{

													float4 vPosF = mul(g_mProjection, float4(0,0,center.z-radius,1));

													vMin.z = max(vMin.z, vPosF.z/vPosF.w);

												}

												if((center.z+radius)>0.0)

												{

													float4 vPosB = mul(g_mProjection, float4(0,0,center.z+radius,1));

													vMax.z = min(vMax.z, vPosB.z/vPosB.w);

												}

								#else

												if((center.z+radius)<0.0)

												{

													float4 vPosF = mul(g_mProjection, float4(0,0,center.z+radius,1));

													vMin.z = max(vMin.z, vPosF.z/vPosF.w);

												}

												if((center.z-radius)<0.0)

												{

													float4 vPosB = mul(g_mProjection, float4(0,0,center.z-radius,1));

													vMax.z = min(vMax.z, vPosB.z/vPosB.w);

												}

								#endif

												// NOTE: this 'else' pairs with the last 'if' of whichever preprocessor branch is active above
												// (the vMax.z test). It runs when that test fails - presumably meaning the whole bounding
												// sphere is behind the camera - and replaces the bound with the off-screen box.
												else

												{

													vMin = float3(-3,-3,-3);

													vMax = float3(-2,-2,-2);

												}

											}


											// we should consider doing a look-up here into a max depth mip chain

											// to see if the light is occluded: vMin.z*VIEWPORT_SCALE_Z > MipTexelMaxDepth

											//g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, -0.5*vMax.y+0.5, vMin.z*VIEWPORT_SCALE_Z);

											//g_vBoundsBuffer[lgtIndex+g_iNrVisibLights] = float3(0.5*vMax.x+0.5, -0.5*vMin.y+0.5, vMax.z*VIEWPORT_SCALE_Z);


											// changed for unity
											// (no y flip, unlike the commented-out lines above); x and y are remapped with 0.5*v+0.5

											g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, 0.5*vMin.y+0.5, vMin.z*VIEWPORT_SCALE_Z);

											g_vBoundsBuffer[lgtIndex+(int) g_iNrVisibLights] = float3(0.5*vMax.x+0.5, 0.5*vMax.y+0.5, vMax.z*VIEWPORT_SCALE_Z);

										}

									}

								}


								float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p);


								// Clips the polygon stored in one of a light's two LDS clip buffers against a single clip plane
								// and writes the resulting polygon into the light's other clip buffer.
								//   iSrcIndex   : which of the two buffers (0 or 1) holds the input polygon; the other one receives the output
								//   iNrSrcVerts : number of vertices of the input polygon
								//   subLigt     : light slot within the thread group (selects the LDS region)
								//   p           : plane index; corresponds to bit p of the code returned by GetClip()
								// Returns the number of vertices written. At most MAX_PNTS vertices are written; any further
								// vertices are silently dropped.
								int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p)
								{
									const int srcBase = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
									const int dstBase = (1-iSrcIndex)*MAX_PNTS+subLigt*MAX_PNTS*2;
									const unsigned int planeBit = (1<<p);

									// start from the last vertex so that the closing edge (last -> first) is processed first
									const int tailSlot = srcBase+(iNrSrcVerts-1);
									float4 vTail = float4(posX[tailSlot], posY[tailSlot], posZ[tailSlot], posW[tailSlot]);
									bool bTailInside = (GetClip(vTail)&planeBit)==0;

									int written = 0;
									for(int v=0; v<iNrSrcVerts; v++)
									{
										const int headSlot = srcBase+v;
										const float4 vHead = float4(posX[headSlot], posY[headSlot], posZ[headSlot], posW[headSlot]);
										const bool bHeadInside = (GetClip(vHead)&planeBit)==0;

										// the edge tail -> head crosses the plane: emit the intersection point
										if(bHeadInside!=bTailInside && written<MAX_PNTS)
										{
											float4 vCut;
											if(bHeadInside)
												vCut = GenNewVert(vHead, vTail, p);
											else
												vCut = GenNewVert(vTail, vHead, p);

											const int cutSlot = dstBase+written;
											posX[cutSlot]=vCut.x;
											posY[cutSlot]=vCut.y;
											posZ[cutSlot]=vCut.z;
											posW[cutSlot]=vCut.w;
											++written;
										}

										// a vertex on the visible side of the plane is kept unchanged
										if(bHeadInside && written<MAX_PNTS)
										{
											const int keepSlot = dstBase+written;
											posX[keepSlot]=vHead.x;
											posY[keepSlot]=vHead.y;
											posZ[keepSlot]=vHead.z;
											posW[keepSlot]=vHead.w;
											++written;
										}

										vTail = vHead;
										bTailInside = bHeadInside;
									}

									return written;
								}


								// Returns a 6-bit code telling which clip planes the homogeneous point P lies outside of.
								//   bit 0 (1) : P.x < -P.w        bit 1 (2)  : P.x > P.w
								//   bit 2 (4) : P.y < -P.w        bit 3 (8)  : P.y > P.w
								//   bit 4 (16): P.z < 0           bit 5 (32) : P.z > P.w
								// A result of 0 means P is outside none of the six planes.
								unsigned int GetClip(const float4 P)
								{
									unsigned int uCode = 0;

									if(P.x<-P.w) uCode |= 1;
									if(P.x>P.w)  uCode |= 2;
									if(P.y<-P.w) uCode |= 4;
									if(P.y>P.w)  uCode |= 8;
									if(P.z<0)    uCode |= 16;
									if(P.z>P.w)  uCode |= 32;

									return uCode;
								}


								// Computes the point where the segment between vVisib and vInvisib meets clip plane p.
								//   vVisib   : endpoint on the visible side of plane p
								//   vInvisib : endpoint on the other side of plane p
								//   p        : plane index (same numbering as the bits of GetClip()): p/2 selects the
								//              coordinate (0 = x, 1 = y, 2 = z) and the plane is coord = fSign*w, with
								//              fSign = 0 for p==4, -1 for other even p and +1 for odd p
								// Returns the interpolated homogeneous vertex, with the clipped coordinate set to exactly fSign*w.
								float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p)
								{
									const int axis = ((uint) p)/2;

									float fSign;
									if(p==4)
										fSign = 0;
									else if((p&1)==0)
										fSign = -1;
									else
										fSign = 1;

									// pick the coordinate that plane p constrains
									float cVisib, cInvisib;
									if(axis==0)      { cVisib = vVisib.x; cInvisib = vInvisib.x; }
									else if(axis==1) { cVisib = vVisib.y; cInvisib = vInvisib.y; }
									else             { cVisib = vVisib.z; cInvisib = vInvisib.z; }

									// solve  fSign*((vVisib.w-vInvisib.w)*t + vInvisib.w) = (cVisib-cInvisib)*t + cInvisib  for t
									const float fT = (fSign*vInvisib.w-cInvisib)/((cVisib-cInvisib) - fSign*(vVisib.w-vInvisib.w));
									float4 vOut = vVisib*fT + vInvisib*(1-fT);

									// overwrite the clipped coordinate so that it lies exactly on the plane
									const float fOnPlane = fSign*vOut.w;
									if(axis==0)      vOut.x = fOnPlane;
									else if(axis==1) vOut.y = fOnPlane;
									else             vOut.z = fOnPlane;

									return vOut;
								}


								// Transforms a plane, given as a float4 of coefficients, by multiplying it as a row vector
								// with InvProjection: mul(plane, InvProjection).
								float4 TransformPlaneToPostSpace(float4x4 InvProjection, float4 plane)
								{
									const float4 planePost = mul(plane, InvProjection);
									return planePost;
								}


								// Evaluates a pair of 2D plane normals for a circle of radius r centred at posXY_in.
								// The result packs the two normals as (res.xy, res.zw); each normal n satisfies
								// dot(n, posXY_in) = -r (these look like the two lines through the origin that are
								// tangent to the circle).
								//   validPlanes : set to true only when dot(posXY_in, posXY_in) > r*r; the returned
								//                 values should not be used otherwise
								float4 EvalPlanePair(out bool validPlanes, float2 posXY_in, float r)
								{
									// work in a frame rotated by 90 degrees when |x| > |y|, so that the divisions
									// below use the component with the larger magnitude
									const bool bRotated = abs(posXY_in.y)<abs(posXY_in.x);
									float2 c = posXY_in;
									if(bRotated)
										c = float2(-posXY_in.y, posXY_in.x);

									const float fLenSq = dot(c, c);
									const float fDiffSq = fLenSq - r*r;
									const float fD = c.y * sqrt(max(0.0, fDiffSq));

									float4 planes;
									planes.x = (-r*c.x - fD) / fLenSq;
									planes.z = (-r*c.x + fD) / fLenSq;
									planes.y = (-r-planes.x*c.x) / c.y;
									planes.w = (-r-planes.z*c.x) / c.y;

									// undo the 90 degree rotation
									if(bRotated)
										planes = float4(planes.y, -planes.x, planes.w, -planes.z);

									validPlanes = fDiffSq>0.0;

									return planes;
								}


								// Computes a 2D bound for a sphere of radius r centred at pos_view_space.
								// A plane pair is evaluated for the (x,z) and for the (y,z) coordinates of the centre with
								// EvalPlanePair(), transformed with TransformPlaneToPostSpace() and converted to bounds.
								//   bIsMinValid / bIsMaxValid : per component (x, y), whether vMin / vMax may be used
								//   vMin / vMax               : resulting bounds; only meaningful where the matching flag is true
								//   InvProjection             : matrix handed to TransformPlaneToPostSpace()
								void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r)
								{
									bool bValidX, bValidY;
									float4 pairX = EvalPlanePair(bValidX, pos_view_space.xz, r);
									float4 pairY = EvalPlanePair(bValidY, pos_view_space.yz, r);

								#if USE_LEFTHAND_CAMERASPACE
									// need to swap left/right and top/bottom planes when using left hand system
									pairX = pairX.zwxy;
									pairY = pairY.zwxy;
								#endif

									bIsMinValid = bool2(bValidX && pairX.z<0, bValidY && pairY.z<0);
									bIsMaxValid = bool2(bValidX && pairX.x>0, bValidY && pairY.x>0);

									// hopefully the compiler takes the zero components into account; this should be the
									// case since the transformation in TransformPlaneToPostSpace() is done using
									// multiply-adds and not dot product instructions.
									const float4 postX0 = TransformPlaneToPostSpace(InvProjection, float4(pairX.x, 0, pairX.y, 0));
									const float4 postX1 = TransformPlaneToPostSpace(InvProjection, float4(pairX.z, 0, pairX.w, 0));
									const float4 postY0 = TransformPlaneToPostSpace(InvProjection, float4(0, pairY.x, pairY.y, 0));
									const float4 postY1 = TransformPlaneToPostSpace(InvProjection, float4(0, pairY.z, pairY.w, 0));

									// convert the planes to the forms (1,0,0,D) and (0,1,0,D); the 2D bound is given by -D
									vMax = -float2(postX0.w / postX0.x, postY0.w / postY0.y);
									vMin = -float2(postX1.w / postX1.x, postY1.w / postY1.y);
								}