您最多选择25个主题 主题必须以中文或者字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 

498 行
18 KiB

// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
// https://github.com/wolfgangfengel/GPU-Pro-7
#pragma kernel TileLightListGen LIGHTLISTGEN=TileLightListGen
#pragma kernel TileLightListGen_SrcBigTile LIGHTLISTGEN=TileLightListGen_SrcBigTile USE_TWO_PASS_TILED_LIGHTING
#pragma kernel TileLightListGen_FeatureFlags LIGHTLISTGEN=TileLightListGen_FeatureFlags USE_FEATURE_FLAGS
#pragma kernel TileLightListGen_SrcBigTile_FeatureFlags LIGHTLISTGEN=TileLightListGen_SrcBigTile_FeatureFlags USE_TWO_PASS_TILED_LIGHTING USE_FEATURE_FLAGS
#pragma kernel TileLightListGen_Oblique LIGHTLISTGEN=TileLightListGen_Oblique USE_OBLIQUE_MODE
#pragma kernel TileLightListGen_SrcBigTile_Oblique LIGHTLISTGEN=TileLightListGen_SrcBigTile_Oblique USE_TWO_PASS_TILED_LIGHTING USE_OBLIQUE_MODE
#pragma kernel TileLightListGen_FeatureFlags_Oblique LIGHTLISTGEN=TileLightListGen_FeatureFlags_Oblique USE_FEATURE_FLAGS USE_OBLIQUE_MODE
#pragma kernel TileLightListGen_SrcBigTile_FeatureFlags_Oblique LIGHTLISTGEN=TileLightListGen_SrcBigTile_FeatureFlags_Oblique USE_TWO_PASS_TILED_LIGHTING USE_FEATURE_FLAGS USE_OBLIQUE_MODE
//#pragma #pragma enable_d3d11_debug_symbols
#include "CoreRP/ShaderLibrary/Common.hlsl"
#include "ShaderBase.hlsl"
#include "LightLoop.cs.hlsl"
#include "LightingConvexHullUtils.hlsl"
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
#include "SortingComputeUtils.hlsl"
#endif
#pragma only_renderers d3d11 ps4 xboxone vulkan metal switch
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
uniform int g_iNrVisibLights;
uniform uint2 g_viDimensions;
uniform float4x4 g_mInvScrProjection;
uniform float4x4 g_mScrProjection;
uniform uint g_isOrthographic;
uniform int _EnvLightIndexShift;
uniform int _DecalIndexShift;
uniform uint g_BaseFeatureFlags;
Texture2D g_depth_tex : register( t0 );
StructuredBuffer<float4> g_vBoundsBuffer : register( t1 );
StructuredBuffer<LightVolumeData> _LightVolumeData : register(t2);
StructuredBuffer<SFiniteLightBound> g_data : register( t3 );
#ifdef USE_TWO_PASS_TILED_LIGHTING
StructuredBuffer<uint> g_vBigTileLightList : register( t4 ); // don't support Buffer yet in unity
#endif
#define NR_THREADS 64
// output buffer
RWStructuredBuffer<uint> g_vLightList : register( u0 ); // don't support RWBuffer yet in unity
#define MAX_NR_COARSE_ENTRIES 64
#define MAX_NR_PRUNED_ENTRIES 24
#define CATEGORY_LIST_SIZE (LIGHTCATEGORY_COUNT - 1) // Skip density volumes
groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];
groupshared unsigned int prunedList[MAX_NR_COARSE_ENTRIES]; // temporarily support room for all 64 while in LDS
groupshared uint ldsZMin;
groupshared uint ldsZMax;
groupshared uint lightOffs;
#ifdef FINE_PRUNING_ENABLED
groupshared uint ldsDoesLightIntersect[2];
#endif
groupshared int ldsNrLightsFinal;
groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE];
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
groupshared uint lightOffsSph;
#endif
#ifdef USE_FEATURE_FLAGS
groupshared uint ldsFeatureFlags;
RWStructuredBuffer<uint> g_TileFeatureFlags;
#endif
float GetLinearDepth(float2 pixXY, float zDptBufSpace) // 0 is near 1 is far
{
#ifdef USE_OBLIQUE_MODE
float2 res2 = mul(g_mInvScrProjection, float4(pixXY, zDptBufSpace, 1.0)).zw;
return res2.x / res2.y;
#else
// for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
// however this function must also work for orthographic projection so we keep it like this.
float m22 = g_mInvScrProjection[2].z, m23 = g_mInvScrProjection[2].w;
float m32 = g_mInvScrProjection[3].z, m33 = g_mInvScrProjection[3].w;
return (m22*zDptBufSpace+m23) / (m32*zDptBufSpace+m33);
#endif
}
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
bool isOrthographic = g_isOrthographic!=0;
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z;
float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z;
#if USE_LEFT_HAND_CAMERA_SPACE
bool useLeftHandVersion = true;
#else
bool useLeftHandVersion = isOrthographic;
#endif
float s = useLeftHandVersion ? 1 : (-1);
float2 p = float2( (s*v2ScrPos.x-fCx)/fSx, (s*v2ScrPos.y-fCy)/fSy);
return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth);
}
float GetOnePixDiagWorldDistAtDepthOne()
{
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
return length( float2(1.0/fSx,1.0/fSy) );
}
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
#endif
#ifdef FINE_PRUNING_ENABLED
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths);
#endif
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
if(t<MAX_NR_COARSE_ENTRIES)
prunedList[t]=0;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint nrTilesX = (iWidth+15)/16;
uint nrTilesY = (iHeight+15)/16;
uint nrTiles = nrTilesX * nrTilesY; // Precompute?
// build tile scr boundary
const uint uFltMax = 0x7f7fffff; // FLT_MAX as a uint
if(t==0)
{
ldsZMin = uFltMax;
ldsZMax = 0;
lightOffs = 0;
}
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
uint2 viTilLL = 16*tileIDX;
// establish min and max depth first
float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;
float4 vLinDepths;
{
// Fetch depths and calculate min/max
UNITY_UNROLL
for(int i = 0; i < 4; i++)
{
int idx = i * NR_THREADS + t;
uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
const float fDepth = FetchDepth(g_depth_tex, uCrd);
vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
if(fDepth<VIEWPORT_SCALE_Z) // if not skydome
{
dpt_mi = min(fDepth, dpt_mi);
dpt_ma = max(fDepth, dpt_ma);
}
}
InterlockedMax(ldsZMax, asuint(dpt_ma));
InterlockedMin(ldsZMin, asuint(dpt_mi));
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
}
float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));
float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));
vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy;
// build coarse list using AABB
#ifdef USE_TWO_PASS_TILED_LIGHTING
const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);
int NrBigTilesX = (nrTilesX+((1<<log2BigTileToTileRatio)-1))>>log2BigTileToTileRatio;
const int bigTileIdx = (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio); // map the idx to 64x64 tiles
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
{
int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+l0+1];
#else
for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
{
#endif
// Skip density volumes (lights are sorted by category). TODO: improve data locality
if (_LightVolumeData[l].lightCategory == LIGHTCATEGORY_DENSITY_VOLUME) { break; }
const float3 vMi = g_vBoundsBuffer[l].xyz;
const float3 vMa = g_vBoundsBuffer[l+g_iNrVisibLights].xyz;
if( all(vMa>vTileLL) && all(vMi<vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
if(uIndex<MAX_NR_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
}
}
#ifdef FINE_PRUNING_ENABLED
if(t<2) ldsDoesLightIntersect[t] = 0;
#endif
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );
#endif
#ifndef FINE_PRUNING_ENABLED
{
if((int)t<iNrCoarseLights) prunedList[t] = coarseList[t];
if(t==0) ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
if(t<CATEGORY_LIST_SIZE) ldsCategoryListCount[t]=0;
#ifdef USE_FEATURE_FLAGS
if(t==0) ldsFeatureFlags=0;
#endif
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
int nrLightsCombinedList = min(ldsNrLightsFinal,MAX_NR_COARSE_ENTRIES);
for(int i=t; i<nrLightsCombinedList; i+=NR_THREADS)
{
InterlockedAdd(ldsCategoryListCount[_LightVolumeData[prunedList[i]].lightCategory], 1);
#ifdef USE_FEATURE_FLAGS
InterlockedOr(ldsFeatureFlags, _LightVolumeData[prunedList[i]].featureFlags);
#endif
}
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
SORTLIST(prunedList, nrLightsCombinedList, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif
#ifdef USE_FEATURE_FLAGS
if(t == 0)
{
uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;
// In case of back
if(ldsZMax < ldsZMin) // is background pixel
{
// There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. It will still execute but will do nothing
featureFlags = 0;
}
g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x] = featureFlags;
}
#endif
// write lights to global buffers
int localOffs=0;
int offs = tileIDX.y*nrTilesX + tileIDX.x;
// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
// to make it work correctly
int shiftIndex[CATEGORY_LIST_SIZE];
ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);
shiftIndex[CATEGORY_LIST_SIZE - 2] = _EnvLightIndexShift;
shiftIndex[CATEGORY_LIST_SIZE - 1] = _DecalIndexShift;
for(int category=0; category<CATEGORY_LIST_SIZE; category++)
{
int nrLightsFinal = ldsCategoryListCount[category];
int nrLightsFinalClamped = nrLightsFinal<MAX_NR_PRUNED_ENTRIES ? nrLightsFinal : MAX_NR_PRUNED_ENTRIES;
const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;
for(int l=(int) t; l<(int) nrDWords; l += NR_THREADS)
{
// We remap the prunedList index to the original LightData / EnvLightData indices
uint uLow = l==0 ? nrLightsFinalClamped : prunedList[max(0,2 * l - 1 + localOffs)] - shiftIndex[category];
uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];
g_vLightList[16*offs + l] = (uLow&0xffff) | (uHigh<<16);
}
localOffs += nrLightsFinal;
offs += (nrTilesX*nrTilesY);
}
}
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
if(threadID==0) lightOffsSph = 0;
// make a copy of coarseList in prunedList.
int l;
for(l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
prunedList[l]=coarseList[l];
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
#if USE_LEFT_HAND_CAMERA_SPACE
float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
#else
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
#endif
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float halfTileSizeAtZDistOne = 8*onePixDiagDist; // scale by half a tile
for(l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
{
SFiniteLightBound lightData = g_data[prunedList[l]];
if( DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius, g_isOrthographic!=0) )
{
unsigned int uIndex;
InterlockedAdd(lightOffsSph, 1, uIndex);
coarseList[uIndex]=prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
}
}
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
return lightOffsSph;
}
#endif
#ifdef FINE_PRUNING_ENABLED
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths)
{
uint t = threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint uLightsFlags[2] = {0,0};
int l=0;
// need this outer loop even on xb1 and ps4 since direct lights and
// reflection lights are kept in separate regions.
while(l<iNrCoarseLights)
{
// fetch light
int idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uint uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
// spot
while(l<iNrCoarseLights && uLightVolume==LIGHTVOLUMETYPE_CONE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
// TODO: Change by SebL
const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 fromLight = vVPos-lightData.lightPos.xyz;
float distSq = dot(fromLight,fromLight);
const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz
float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );
float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
if( all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) ) ) uVal = 1;
}
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
}
// sphere
while(l<iNrCoarseLights && uLightVolume==LIGHTVOLUMETYPE_SPHERE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 vLp = lightData.lightPos.xyz;
float3 toLight = vLp - vVPos;
float distSq = dot(toLight,toLight);
if(lightData.radiusSq>distSq) uVal = 1;
}
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
}
// Box
while(l<iNrCoarseLights && uLightVolume==LIGHTVOLUMETYPE_BOX)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
{
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 toLight = lightData.lightPos.xyz - vVPos;
float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
if( max(max(dist.x, dist.y), dist.z)<1 ) uVal = 1; // but allows us to not write out OuterDists
}
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLightVolume = l<iNrCoarseLights ? _LightVolumeData[idxCoarse].lightVolume : 0;
}
// in case we have some corrupt data make sure we terminate
if(uLightVolume >=LIGHTVOLUMETYPE_COUNT) ++l;
}
InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
if(t==0) ldsNrLightsFinal = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 )
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
if(uIndex<MAX_NR_COARSE_ENTRIES) prunedList[uIndex] = coarseList[t]; // we allow up to 64 pruned lights while stored in LDS.
}
}
#endif