您最多选择25个主题 主题必须以中文或者字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 

497 行
16 KiB

#pragma kernel ScreenBoundsAABB
#include "..\common\ShaderBase.h"
#include "LightDefinitions.cs"
uniform int g_iNrVisibLights;
uniform float4x4 g_mInvProjection;
uniform float4x4 g_mProjection;
StructuredBuffer<SFiniteLightBound> g_data : register( t0 );
#define FLT_EPSILON 1.192092896e-07F // smallest such that 1.0+FLT_EPSILON != 1.0
#define NR_THREADS 64
// output buffer
RWStructuredBuffer<float3> g_vBoundsBuffer : register( u0 );
#define MAX_PNTS 9 // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed)
// However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane
// clipping gets skipped which doesn't cause any errors.
// LDS (2496 bytes)
groupshared float posX[MAX_PNTS*8*2];
groupshared float posY[MAX_PNTS*8*2];
groupshared float posZ[MAX_PNTS*8*2];
groupshared float posW[MAX_PNTS*8*2];
groupshared unsigned int clipFlags[48];
unsigned int GetClip(const float4 P);
int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p);
void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r);
void GetQuad(out float3 p0, out float3 p1, out float3 p2, out float3 p3, const float3 vBoxX, const float3 vBoxY, const float3 vBoxZ, const float3 vCen, const float2 vScaleXY, const int sideIndex);
[numthreads(NR_THREADS, 1, 1)]
void ScreenBoundsAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
uint groupID = u3GroupID.x;
//uint vindex = groupID * NR_THREADS + threadID;
unsigned int g = groupID;
unsigned int t = threadID;
const int subLigt = (int) (t/8);
const int lgtIndex = subLigt+(int) g*8;
const int sideIndex = (int) (t%8);
SFiniteLightBound lgtDat = g_data[lgtIndex];
const float3 vBoxX = lgtDat.vBoxAxisX.xyz;
const float3 vBoxY = lgtDat.vBoxAxisY.xyz;
const float3 vBoxZ = -lgtDat.vBoxAxisZ.xyz; // flip an axis to make it right handed since Determinant(worldToView)<0
const float3 vCen = lgtDat.vCen.xyz;
const float fRadius = lgtDat.fRadius;
const float2 vScaleXY = lgtDat.vScaleXY;
{
if(sideIndex<6 && lgtIndex<(int) g_iNrVisibLights) // mask 2 out of 8 threads
{
float3 q0, q1, q2, q3;
GetQuad(q0, q1, q2, q3, vBoxX, vBoxY, vBoxZ, vCen, vScaleXY, sideIndex);
const float4 vP0 = mul(g_mProjection, float4(q0, 1));
const float4 vP1 = mul(g_mProjection, float4(q1, 1));
const float4 vP2 = mul(g_mProjection, float4(q2, 1));
const float4 vP3 = mul(g_mProjection, float4(q3, 1));
// test vertices of one quad (of the convex hull) for intersection
const unsigned int uFlag0 = GetClip(vP0);
const unsigned int uFlag1 = GetClip(vP1);
const unsigned int uFlag2 = GetClip(vP2);
const unsigned int uFlag3 = GetClip(vP3);
const float4 vPnts[] = {vP0, vP1, vP2, vP3};
// screen-space AABB of one quad (assuming no intersection)
float3 vMin, vMax;
for(int k=0; k<4; k++)
{
float fW = vPnts[k].w;
float fS = fW<0 ? -1 : 1;
float fWabs = fW<0 ? (-fW) : fW;
fW = fS * (fWabs<FLT_EPSILON ? FLT_EPSILON : fWabs);
float3 vP = float3(vPnts[k].x/fW, vPnts[k].y/fW, vPnts[k].z/fW);
if(k==0) { vMin=vP; vMax=vP; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
}
clipFlags[subLigt*6+sideIndex] = (uFlag0<<0) | (uFlag1<<6) | (uFlag2<<12) | (uFlag3<<18);
// store in clip buffer (only use these vMin and vMax if light is 100% visible in which case clipping isn't needed)
posX[subLigt*MAX_PNTS*2 + sideIndex] = vMin.x;
posY[subLigt*MAX_PNTS*2 + sideIndex] = vMin.y;
posZ[subLigt*MAX_PNTS*2 + sideIndex] = vMin.z;
posX[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.x;
posY[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.y;
posZ[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.z;
}
}
// if not XBONE and not PLAYSTATION4 we need a memorybarrier here
// since we can't rely on the gpu cores being 64 wide.
// We need a pound define around this.
GroupMemoryBarrierWithGroupSync();
{
int f=0;
if(sideIndex==0 && lgtIndex<(int) g_iNrVisibLights)
{
// quick acceptance or rejection
unsigned int uCollectiveAnd = (unsigned int) -1;
unsigned int uCollectiveOr = 0;
for(f=0; f<6; f++)
{
unsigned int uFlagAnd = clipFlags[subLigt*6+f]&0x3f;
unsigned int uFlagOr = uFlagAnd;
for(int i=1; i<4; i++)
{
unsigned int uClipBits = (clipFlags[subLigt*6+f]>>(i*6))&0x3f;
uFlagAnd &= uClipBits;
uFlagOr |= uClipBits;
}
uCollectiveAnd &= uFlagAnd;
uCollectiveOr |= uFlagOr;
}
bool bSetBoundYet = false;
float3 vMin=0.0, vMax=0.0;
if(uCollectiveAnd!=0 || uCollectiveOr==0) // all invisible or all visible (early out)
{
if(uCollectiveOr==0) // all visible
{
for(f=0; f<6; f++)
{
const int sideIndex = f;
float3 vFaceMi = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 0], posY[subLigt*MAX_PNTS*2 + sideIndex + 0], posZ[subLigt*MAX_PNTS*2 + sideIndex + 0]);
float3 vFaceMa = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 6], posY[subLigt*MAX_PNTS*2 + sideIndex + 6], posZ[subLigt*MAX_PNTS*2 + sideIndex + 6]);
for(int k=0; k<2; k++)
{
float3 vP = k==0 ? vFaceMi : vFaceMa;
if(f==0 && k==0) { vMin=vP; vMax=vP; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
}
}
bSetBoundYet=true;
}
}
else // :( need true clipping
{
for(f=0; f<6; f++)
{
float3 q0, q1, q2, q3;
GetQuad(q0, q1, q2, q3, vBoxX, vBoxY, vBoxZ, vCen, vScaleXY, f);
// 4 vertices to a quad of the convex hull in post projection space
const float4 vP0 = mul(g_mProjection, float4(q0, 1));
const float4 vP1 = mul(g_mProjection, float4(q1, 1));
const float4 vP2 = mul(g_mProjection, float4(q2, 1));
const float4 vP3 = mul(g_mProjection, float4(q3, 1));
int iSrcIndex = 0;
int offs = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
// fill up source clip buffer with the quad
posX[offs+0]=vP0.x; posX[offs+1]=vP1.x; posX[offs+2]=vP2.x; posX[offs+3]=vP3.x;
posY[offs+0]=vP0.y; posY[offs+1]=vP1.y; posY[offs+2]=vP2.y; posY[offs+3]=vP3.y;
posZ[offs+0]=vP0.z; posZ[offs+1]=vP1.z; posZ[offs+2]=vP2.z; posZ[offs+3]=vP3.z;
posW[offs+0]=vP0.w; posW[offs+1]=vP1.w; posW[offs+2]=vP2.w; posW[offs+3]=vP3.w;
int iNrSrcVerts = 4;
// do true clipping
for(int p=0; p<6; p++)
{
const int nrVertsDst = ClipAgainstPlane(iSrcIndex, iNrSrcVerts, subLigt, p);
iSrcIndex = 1-iSrcIndex;
iNrSrcVerts = nrVertsDst;
if(iNrSrcVerts<3 || iNrSrcVerts>=MAX_PNTS) break;
}
// final clipped convex primitive is in src buffer
if(iNrSrcVerts>2)
{
int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
for(int k=0; k<iNrSrcVerts; k++)
{
float4 vCur = float4(posX[offs_src+k], posY[offs_src+k], posZ[offs_src+k], posW[offs_src+k]);
// project and apply toward AABB
float3 vP = float3(vCur.x/vCur.w, vCur.y/vCur.w, vCur.z/vCur.w);
if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
}
}
}
////////////////////// look for camera frustum verts that need to be included. That is frustum vertices inside the convex hull for the light
int i=0;
for(i=0; i<8; i++) // establish 8 camera frustum vertices
{
float3 vVertPSpace = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);
float4 v4ViewSpace = mul(g_mInvProjection, float4(vVertPSpace,1));
float3 vViewSpace = float3(v4ViewSpace.x/v4ViewSpace.w, v4ViewSpace.y/v4ViewSpace.w, v4ViewSpace.z/v4ViewSpace.w);
posX[subLigt*MAX_PNTS*2 + i] = vViewSpace.x;
posY[subLigt*MAX_PNTS*2 + i] = vViewSpace.y;
posZ[subLigt*MAX_PNTS*2 + i] = vViewSpace.z;
}
// determine which camera frustum vertices are inside the convex hull
uint uVisibFl = 0xff;
for(f=0; f<6; f++)
{
float3 vP0, vP1, vP2, vP3;
GetQuad(vP0, vP1, vP2, vP3, vBoxX, vBoxY, vBoxZ, vCen, vScaleXY, f);
// one edge might be zero length so we do all 4
float3 vN = cross(vP1-vP0, vP3-vP0) + cross(vP2-vP1, vP0-vP1) + cross(vP3-vP2, vP1-vP2) + cross(vP0-vP3, vP2-vP3);
float fLen = length(vN);
if(fLen>1) vN = normalize(vN); // this won't necessarily be a non zero vector (spot lights have all 4 top points as the same)
for(i=0; i<8; i++)
{
float3 vViewSpace = float3(posX[subLigt*MAX_PNTS*2 + i], posY[subLigt*MAX_PNTS*2 + i], posZ[subLigt*MAX_PNTS*2 + i]);
#ifdef LEFT_HAND_COORDINATES
uVisibFl &= ( dot(vViewSpace-vP0, vN)<0 ? 0xff : (~(1<<i)) );
#else
uVisibFl &= ( dot(vViewSpace-vP0, vN)>0 ? 0xff : (~(1<<i)) );
#endif
}
}
// apply camera frustum vertices inside the convex hull to the AABB
for(i=0; i<8; i++)
{
if((uVisibFl&(1<<i))!=0)
{
float3 vP = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);
if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
}
}
}
// determine AABB bound in [-1;1]x[-1;1] screen space using bounding sphere.
// Use the result to make our already established AABB from the convex hull
// potentially tighter.
if(!bSetBoundYet)
{
// set the AABB off-screen
vMin = float3(-3,-3,-3);
vMax = float3(-2,-2,-2);
}
else
{
//if((vCen.z+fRadius)<0.0)
if( length(vCen)>fRadius)
{
float2 vMi, vMa;
bool2 bMi, bMa;
CalcBound(bMi, bMa, vMi, vMa, g_mInvProjection, vCen, fRadius);
vMin.xy = bMi ? max(vMin.xy, vMi) : vMin.xy;
vMax.xy = bMa ? min(vMax.xy, vMa) : vMax.xy;
}
#ifdef LEFT_HAND_COORDINATES
if((vCen.z-fRadius)>0.0)
{
float4 vPosF = mul(g_mProjection, float4(0,0,vCen.z-fRadius,1));
vMin.z = max(vMin.z, vPosF.z/vPosF.w);
}
if((vCen.z+fRadius)>0.0)
{
float4 vPosB = mul(g_mProjection, float4(0,0,vCen.z+fRadius,1));
vMax.z = min(vMax.z, vPosB.z/vPosB.w);
}
#else
if((vCen.z+fRadius)<0.0)
{
float4 vPosF = mul(g_mProjection, float4(0,0,vCen.z+fRadius,1));
vMin.z = max(vMin.z, vPosF.z/vPosF.w);
}
if((vCen.z-fRadius)<0.0)
{
float4 vPosB = mul(g_mProjection, float4(0,0,vCen.z-fRadius,1));
vMax.z = min(vMax.z, vPosB.z/vPosB.w);
}
#endif
else
{
vMin = float3(-3,-3,-3);
vMax = float3(-2,-2,-2);
}
}
// we should consider doing a look-up here into a max depth mip chain
// to see if the light is occluded: vMin.z*VIEWPORT_SCALE_Z > MipTexelMaxDepth
//g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, -0.5*vMax.y+0.5, vMin.z*VIEWPORT_SCALE_Z);
//g_vBoundsBuffer[lgtIndex+g_iNrVisibLights] = float3(0.5*vMax.x+0.5, -0.5*vMin.y+0.5, vMax.z*VIEWPORT_SCALE_Z);
// changed for unity
g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, 0.5*vMin.y+0.5, vMin.z*VIEWPORT_SCALE_Z);
g_vBoundsBuffer[lgtIndex+(int) g_iNrVisibLights] = float3(0.5*vMax.x+0.5, 0.5*vMax.y+0.5, vMax.z*VIEWPORT_SCALE_Z);
}
}
}
float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p);
int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p)
{
int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
int offs_dst = (1-iSrcIndex)*MAX_PNTS+subLigt*MAX_PNTS*2;
float4 vPrev = float4(posX[offs_src+(iNrSrcVerts-1)], posY[offs_src+(iNrSrcVerts-1)], posZ[offs_src+(iNrSrcVerts-1)], posW[offs_src+(iNrSrcVerts-1)]);
int nrVertsDst = 0;
unsigned int uMask = (1<<p);
bool bIsPrevVisib = (GetClip(vPrev)&uMask)==0;
for(int i=0; i<iNrSrcVerts; i++)
{
float4 vCur = float4(posX[offs_src+i], posY[offs_src+i], posZ[offs_src+i], posW[offs_src+i]);
bool bIsCurVisib = (GetClip(vCur)&uMask)==0;
if( (bIsCurVisib && !bIsPrevVisib) || (!bIsCurVisib && bIsPrevVisib) )
{
//assert(nrVertsDst<MAX_PNTS);
if(nrVertsDst<MAX_PNTS)
{
// generate new vertex
float4 vNew = GenNewVert(bIsCurVisib ? vCur : vPrev, bIsCurVisib ? vPrev : vCur, p);
posX[offs_dst+nrVertsDst]=vNew.x; posY[offs_dst+nrVertsDst]=vNew.y; posZ[offs_dst+nrVertsDst]=vNew.z; posW[offs_dst+nrVertsDst]=vNew.w;
++nrVertsDst;
}
}
if(bIsCurVisib)
{
//assert(nrVertsDst<MAX_PNTS);
if(nrVertsDst<MAX_PNTS)
{
posX[offs_dst+nrVertsDst]=vCur.x; posY[offs_dst+nrVertsDst]=vCur.y; posZ[offs_dst+nrVertsDst]=vCur.z; posW[offs_dst+nrVertsDst]=vCur.w;
++nrVertsDst;
}
}
vPrev = vCur;
bIsPrevVisib = bIsCurVisib;
}
return nrVertsDst;
}
unsigned int GetClip(const float4 P)
{
//-P.w <= P.x <= P.w
return ((P.x<-P.w)?1:0) | ((P.x>P.w)?2:0) | ((P.y<-P.w)?4:0) | ((P.y>P.w)?8:0) | ((P.z<0)?16:0) | ((P.z>P.w)?32:0);
}
float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p)
{
const float fS = p==4 ? 0 : ((p&1)==0 ? -1 : 1);
const int index = ((uint) p)/2;
float x1 = index==0 ? vVisib.x : (index==1 ? vVisib.y : vVisib.z);
float x0 = index==0 ? vInvisib.x : (index==1 ? vInvisib.y : vInvisib.z);
//fS*((vVisib.w-vInvisib.w)*t + vInvisib.w) = (x1-x0)*t + x0;
const float fT = (fS*vInvisib.w-x0)/((x1-x0) - fS*(vVisib.w-vInvisib.w));
float4 vNew = vVisib*fT + vInvisib*(1-fT);
// just to be really anal we make sure the clipped against coordinate is precise
if(index==0) vNew.x = fS*vNew.w;
else if(index==1) vNew.y = fS*vNew.w;
else vNew.z = fS*vNew.w;
return vNew;
}
void GetQuad(out float3 p0, out float3 p1, out float3 p2, out float3 p3, const float3 vBoxX, const float3 vBoxY, const float3 vBoxZ, const float3 vCen, const float2 vScaleXY, const int sideIndex)
{
const int iAbsSide = (sideIndex == 0 || sideIndex == 1) ? 0 : ((sideIndex == 2 || sideIndex == 3) ? 1 : 2);
const float fS = (sideIndex & 1) != 0 ? 1 : (-1);
float3 vA = fS*(iAbsSide == 0 ? vBoxX : (iAbsSide == 1 ? (-vBoxY) : vBoxZ));
float3 vB = fS*(iAbsSide == 0 ? (-vBoxY) : (iAbsSide == 1 ? (-vBoxX) : (-vBoxY)));
float3 vC = iAbsSide == 0 ? vBoxZ : (iAbsSide == 1 ? vBoxZ : (-vBoxX));
bool bIsTopQuad = iAbsSide == 2 && (sideIndex & 1) != 0; // in this case all 4 verts get scaled.
bool bIsSideQuad = (iAbsSide == 0 || iAbsSide == 1); // if side quad only two verts get scaled (impacts q1 and q2)
if (bIsTopQuad) { vB *= vScaleXY.y; vC *= vScaleXY.x; }
float3 vA2 = vA;
float3 vB2 = vB;
if (bIsSideQuad) { vA2 *= (iAbsSide == 0 ? vScaleXY.x : vScaleXY.y); vB2 *= (iAbsSide == 0 ? vScaleXY.y : vScaleXY.x); }
p0 = vCen + vA + vB - vC; // vCen + vA is center of face when vScaleXY is 1.0
p1 = vCen + vA2 + vB2 + vC;
p2 = vCen + vA2 - vB2 + vC;
p3 = vCen + vA - vB - vC;
}
float4 TransformPlaneToPostSpace(float4x4 InvProjection, float4 plane)
{
return mul(plane, InvProjection);
}
float4 EvalPlanePair(float2 posXY_in, float r)
{
// rotate by 90 degrees to avoid potential division by zero
bool bMustFlip = abs(posXY_in.y)<abs(posXY_in.x);
float2 posXY = bMustFlip ? float2(-posXY_in.y, posXY_in.x) : posXY_in;
float fLenSQ = dot(posXY, posXY);
float D = posXY.y * sqrt(fLenSQ - r*r);
float4 res;
res.x = (-r*posXY.x - D) / fLenSQ;
res.z = (-r*posXY.x + D) / fLenSQ;
res.y = (-r-res.x*posXY.x) / posXY.y;
res.w = (-r-res.z*posXY.x) / posXY.y;
// rotate back by 90 degrees
res = bMustFlip ? Vec4(res.y, -res.x, res.w, -res.z) : res;
return res;
}
void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r)
{
float4 planeX = EvalPlanePair(float2(pos_view_space.x, pos_view_space.z), r);
float4 planeY = EvalPlanePair(float2(pos_view_space.y, pos_view_space.z), r);
#ifdef LEFT_HAND_COORDINATES
planeX = planeX.zwxy; // need to swap left/right and top/bottom planes when using left hand system
planeY = planeY.zwxy;
#endif
bIsMinValid = bool2(planeX.z<0, planeY.z<0);
bIsMaxValid = bool2((-planeX.x)<0, (-planeY.x)<0);
// hopefully the compiler takes zeros into account
// should be the case since the transformation in TransformPlaneToPostSpace()
// is done using multiply-adds and not dot product instructions.
float4 planeX0 = TransformPlaneToPostSpace(InvProjection, float4(planeX.x, 0, planeX.y, 0));
float4 planeX1 = TransformPlaneToPostSpace(InvProjection, float4(planeX.z, 0, planeX.w, 0));
float4 planeY0 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.x, planeY.y, 0));
float4 planeY1 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.z, planeY.w, 0));
// convert planes to the forms (1,0,0,D) and (0,1,0,D)
// 2D bound is given by -D components
float2 A = -float2(planeX0.w / planeX0.x, planeY0.w / planeY0.y);
float2 B = -float2(planeX1.w / planeX1.x, planeY1.w / planeY1.y);
// Bound is complete
vMin = B;
vMax = A;
}