您最多选择25个主题 主题必须以中文或者字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 

551 行
16 KiB

// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
// https://github.com/wolfgangfengel/GPU-Pro-7
#pragma kernel TileLightListGen
#include "..\common\ShaderBase.h"
#include "LightDefinitions.cs.hlsl"
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
uniform int g_iNrVisibLights;
uniform uint2 g_viDimensions;
uniform float4x4 g_mInvScrProjection;
uniform float4x4 g_mScrProjection;
Texture2D g_depth_tex : register( t0 );
StructuredBuffer<float3> g_vBoundsBuffer : register( t1 );
StructuredBuffer<SFiniteLightData> g_vLightData : register( t2 );
StructuredBuffer<SFiniteLightBound> g_data : register( t3 );
#define NR_THREADS 64
// output buffer
//RWBuffer<uint4> g_vLightList : register( u0 );
RWStructuredBuffer<uint> g_vLightList : register( u0 );
#define MAX_NR_COARSE_ENTRIES 64
#define MAX_NR_PRUNED_ENTRIES 24
groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];
groupshared unsigned int prunedList[MAX_NR_COARSE_ENTRIES]; // temporarily support room for all 64 while in LDS
groupshared uint ldsZMin;
groupshared uint ldsZMax;
groupshared uint lightOffs;
#ifdef FINE_PRUNING_ENABLED
groupshared uint ldsDoesLightIntersect[2];
#endif
groupshared int ldsNrLightsFinal;
groupshared int ldsModelListCount[NR_LIGHT_MODELS]; // since NR_LIGHT_MODELS is 2
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
groupshared uint lightOffsSph;
#endif
//float GetLinearDepth(float3 vP)
//{
// float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
// return v4Pres.z / v4Pres.w;
//}
float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far
{
float3 vP = float3(0.0f,0.0f,zDptBufSpace);
float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
return v4Pres.z / v4Pres.w;
}
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
float fSx = g_mScrProjection[0].x;
float fCx = g_mScrProjection[0].z;
float fSy = g_mScrProjection[1].y;
float fCy = g_mScrProjection[1].z;
#ifdef LEFT_HAND_COORDINATES
return fLinDepth*float3( ((v2ScrPos.x-fCx)/fSx), ((v2ScrPos.y-fCy)/fSy), 1.0 );
#else
return fLinDepth*float3( -((v2ScrPos.x+fCx)/fSx), -((v2ScrPos.y+fCy)/fSy), 1.0 );
#endif
}
float GetOnePixDiagWorldDistAtDepthOne()
{
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
return length( float2(1.0/fSx,1.0/fSy) );
}
void sortLightList(int localThreadID, int n);
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
#endif
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
if(t<MAX_NR_COARSE_ENTRIES)
prunedList[t]=0;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint nrTilesX = (iWidth+15)/16;
uint nrTilesY = (iHeight+15)/16;
// build tile scr boundary
const uint uFltMax = 0x7f7fffff; // FLT_MAX as a uint
if(t==0)
{
ldsZMin = uFltMax;
ldsZMax = 0;
lightOffs = 0;
}
#if !defined(XBONE) && !defined(PLAYSTATION4)
GroupMemoryBarrierWithGroupSync();
#endif
uint2 viTilLL = 16*tileIDX;
// establish min and max depth first
float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;
float4 vLinDepths;
{
// Fetch depths and calculate min/max
[unroll]
for(int i = 0; i < 4; i++)
{
int idx = i * NR_THREADS + t;
uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
const float fDepth = FetchDepth(g_depth_tex, uCrd);
vLinDepths[i] = GetLinearDepth(fDepth);
if(fDepth<VIEWPORT_SCALE_Z) // if not skydome
{
dpt_mi = min(fDepth, dpt_mi);
dpt_ma = max(fDepth, dpt_ma);
}
}
InterlockedMax(ldsZMax, asuint(dpt_ma));
InterlockedMin(ldsZMin, asuint(dpt_mi));
#if !defined(XBONE) && !defined(PLAYSTATION4)
GroupMemoryBarrierWithGroupSync();
#endif
}
float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));
float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));
vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy;
// build coarse list using AABB
for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
{
const float3 vMi = g_vBoundsBuffer[l];
const float3 vMa = g_vBoundsBuffer[l+g_iNrVisibLights];
if( all(vMa>vTileLL) && all(vMi<vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
if(uIndex<MAX_NR_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
}
}
#ifdef FINE_PRUNING_ENABLED
if(t<2) ldsDoesLightIntersect[t] = 0;
#endif
#if !defined(XBONE) && !defined(PLAYSTATION4)
GroupMemoryBarrierWithGroupSync();
#endif
int iNrCoarseLights = lightOffs<MAX_NR_COARSE_ENTRIES ? lightOffs : MAX_NR_COARSE_ENTRIES;
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );
#endif
#ifndef FINE_PRUNING_ENABLED
{
int iNrLightsOut = iNrCoarseLights<MAX_NR_PRUNED_ENTRIES ? iNrCoarseLights : MAX_NR_PRUNED_ENTRIES;
if((int)t<iNrLightsOut) prunedList[t] = coarseList[t];
if(t==0) ldsNrLightsFinal=iNrLightsOut;
}
#else
{
uint uLightsFlags[2] = {0,0};
int l=0;
// need this outer loop even on xb1 and ps4 since direct lights and
// reflection lights are kept in separate regions.
while(l<iNrCoarseLights)
{
// fetch light
int idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uint uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
// spot
while(l<iNrCoarseLights && uLgtType==SPOT_LIGHT)
{
SFiniteLightData lightData = g_vLightData[idxCoarse];
const bool bIsSpotDisc = (lightData.flags&IS_CIRCULAR_SPOT_SHAPE)!=0;
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 fromLight = vVPos-lightData.lightPos.xyz;
float distSq = dot(fromLight,fromLight);
const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz
float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );
float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
if( all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) ) ) uVal = 1;
}
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
}
// sphere
while(l<iNrCoarseLights && uLgtType==SPHERE_LIGHT)
{
SFiniteLightData lightData = g_vLightData[idxCoarse];
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 vLp = lightData.lightPos.xyz;
float3 toLight = vLp - vVPos;
float distSq = dot(toLight,toLight);
if(lightData.radiusSq>distSq) uVal = 1;
}
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
}
// Box
while(l<iNrCoarseLights && uLgtType==BOX_LIGHT)
{
SFiniteLightData lightData = g_vLightData[idxCoarse];
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 toLight = lightData.lightPos.xyz - vVPos;
float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
if( max(max(dist.x, dist.y), dist.z)<1 ) uVal = 1; // but allows us to not write out OuterDists
}
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
}
// in case we have some corrupt data make sure we terminate
if(uLgtType>=MAX_TYPES) ++l;
}
InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
if(t==0) ldsNrLightsFinal = 0;
#if !defined(XBONE) && !defined(PLAYSTATION4)
GroupMemoryBarrierWithGroupSync();
#endif
if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 )
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
if(uIndex<MAX_NR_COARSE_ENTRIES) prunedList[uIndex] = coarseList[t]; // we allow up to 64 pruned lights while stored in LDS.
}
}
#endif
//
if(t<NR_LIGHT_MODELS) ldsModelListCount[t]=0;
#if !defined(XBONE) && !defined(PLAYSTATION4)
GroupMemoryBarrierWithGroupSync();
#endif
int nrLightsCombinedList = ldsNrLightsFinal<MAX_NR_COARSE_ENTRIES ? ldsNrLightsFinal : MAX_NR_COARSE_ENTRIES;
for(int i=t; i<nrLightsCombinedList; i+=NR_THREADS)
{
InterlockedAdd(ldsModelListCount[ g_vLightData[ prunedList[i] ].lightModel ], 1);
}
// sort lights
#if !defined(XBONE) && !defined(PLAYSTATION4)
sortLightList((int) t, nrLightsCombinedList);
#endif
// write lights to global buffers
int localOffs=0;
int offs = tileIDX.y*nrTilesX + tileIDX.x;
for(int m=0; m<NR_LIGHT_MODELS; m++)
{
int nrLightsFinal = ldsModelListCount[ m ];
int nrLightsFinalClamped = nrLightsFinal<MAX_NR_PRUNED_ENTRIES ? nrLightsFinal : MAX_NR_PRUNED_ENTRIES;
const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;
for(l=(int) t; l<(int) nrDWords; l += NR_THREADS)
{
uint uLow = l==0 ? nrLightsFinalClamped : prunedList[2*l-1+localOffs];
uint uHigh = prunedList[2*l+0+localOffs];
g_vLightList[16*offs + l] = (uLow&0xffff) | (uHigh<<16);
}
localOffs += nrLightsFinal;
offs += (nrTilesX*nrTilesY);
}
}
// original version
//float2 vRay2D = float2(max(V.x,V.y), fSclProj);
//float distSqB = bIsSpotDisc ? distSq : dot(vRay2D,vRay2D);
//if( all( float3(lightData.radiusSq, fSclProj, fSclProj) > float3(distSq, sqrt(distSqB)*lightData.fPenumbra, 0.0) ) ) uVal = 1;
// previous new version
//float fDist2DSqr = bIsSpotDisc ? dot(V,V) : (maC*maC);
//if( all( float3(lightData.radiusSq, (fSclProj*fSclProj), fSclProj) > float3(distSq, fDist2DSqr*cotaSqr, fSpotNearPlane) ) ) uVal = 1;
#if 0
void merge(int l, int m, int r);
void sortLightList(int localThreadID, int n)
{
for(int curr_size=1; curr_size<=n-1; curr_size = 2*curr_size)
{
for(int left_start=localThreadID*(2*curr_size); left_start<(n-1); left_start+=NR_THREADS*(2*curr_size))
{
int mid = left_start + curr_size - 1;
int right_end = min(left_start + 2*curr_size - 1, n-1);
merge(left_start, mid, right_end);
}
GroupMemoryBarrierWithGroupSync();
}
}
//groupshared unsigned int tmpBuffer[MAX_NR_COARSE_ENTRIES];
void merge(int l, int m, int r)
{
int i, j, k;
int ol = l;
int or = m+1;
int sl = m - l + 1; // capacity is size of left list = m - l + 1;
int sr = r - m; // capacity is size of right list = r - m
unsigned int tmpBuffer[] = coarseList; // re use coarse list buffer as temp buffer.
// could do this copy more efficiently before the if-statement
// in sortLightList() but this requires another GroupMemoryBarrierWithGroupSync()
for(int i=l; i<=r; i++) tmpBuffer[i] = prunedList[i];
i = 0;
j = 0;
k = l;
while (i < sl && j < sr)
{
const uint lVal = tmpBuffer[ol+i];
const uint rVal = tmpBuffer[or+j];
bool pickLeft = lVal <= rVal;
i = pickLeft ? (i+1) : i;
j = pickLeft ? j : (j+1);
prunedList[k] = pickLeft ? lVal : rVal;
k++;
}
while (i < sl)
{
prunedList[k] = tmpBuffer[ol+i];
i++; k++;
}
while (j < sr)
{
prunedList[k] = tmpBuffer[or+j];
j++; k++;
}
}
#else
// NOTE! returns 1 when value_in==0
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
unsigned int value = 1;
while(value<value_in && (value<<1)<=maxValue)
value<<=1;
return value;
}
void sortLightList(int localThreadID, int length)
{
// closest pow2 integer greater than or equal to length
const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_COARSE_ENTRIES); // N is 1 when length is zero but will still not enter first for-loop
// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
for(int t=length+localThreadID; t<N; t+=NR_THREADS) { prunedList[t]=MAX_NR_COARSE_ENTRIES; } // impossible index
GroupMemoryBarrierWithGroupSync();
for(int k=2; k<=N; k=2*k)
{
for(int j=k>>1; j>0; j=j>>1)
{
for(int i=localThreadID; i<N; i+=NR_THREADS)
{
int ixj=i^j;
if((ixj)>i)
{
const unsigned int Avalue = prunedList[i];
const unsigned int Bvalue = prunedList[ixj];
const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
if(mustSwap)
{
prunedList[i]=Bvalue;
prunedList[ixj]=Avalue;
}
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
#endif
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
lightOffsSph = 0;
// make a copy of coarseList in prunedList.
for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
prunedList[l]=coarseList[l];
#if !defined(XBONE) && !defined(PLAYSTATION4)
GroupMemoryBarrierWithGroupSync();
#endif
#ifdef LEFT_HAND_COORDINATES
float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
#else
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
#endif
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float worldDistAtDepthOne = 8*onePixDiagDist; // scale by half a tile
int iNrVisib = 0;
for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
{
SFiniteLightBound lightData = g_data[coarseList[l]];
const float3 center = lightData.center.xyz;
float fRad = lightData.radius;
#if 1
float3 maxZdir = float3(-center.z*center.x, -center.z*center.y, center.x*center.x + center.y*center.y); // cross(center,cross(Zaxis,center))
float len = length(maxZdir);
float scalarProj = len>0.0001 ? (maxZdir.z/len) : len; // since len>=(maxZdir.z/len) we can use len as an approximate value when len<=epsilon
float fOffs = scalarProj*fRad;
#else
float fOffs = fRad; // more false positives due to larger radius but works too
#endif
#ifdef LEFT_HAND_COORDINATES
fRad = fRad + (center.z+fOffs)*worldDistAtDepthOne;
#else
fRad = fRad + (center.z-fOffs)*worldDistAtDepthOne;
#endif
float a = dot(V,V);
float CdotV = dot(center,V);
float c = dot(center,center) - fRad*fRad;
float fDescDivFour = CdotV*CdotV - a*c;
if(c<0 || (fDescDivFour>0 && CdotV>0)) // if ray hit bounding sphere
{
unsigned int uIndex;
InterlockedAdd(lightOffsSph, 1, uIndex);
coarseList[uIndex]=prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
}
}
#if !defined(XBONE) && !defined(PLAYSTATION4)
GroupMemoryBarrierWithGroupSync();
#endif
return lightOffsSph;
}
#endif