#include "ShaderBase.hlsl"
#include "LightLoop.cs.hlsl"
#include "LightingConvexHullUtils.hlsl"
#include "LightCullUtils.hlsl"
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
#include "SortingComputeUtils.hlsl"
CBUFFER_START(UnityLightListClustered)
int g_iNrVisibLights;
float4x4 g_mInvScrProjection;
float4x4 g_mScrProjection;
float4x4 g_mInvScrProjectionArr[2];
float4x4 g_mScrProjectionArr[2];
uint g_isOrthographic;
int _EnvLightIndexShift;
int _DecalIndexShift;
CBUFFER_END
// ClusteredUtils.hlsl is dependent on the constants declared in UnityLightListClustered :/
// g_fClustBase, g_fNearPlane, g_fFarPlane, g_iLog2NumClusters
#ifdef MSAA_ENABLED
Texture2DMS<float> g_depth_tex : register( t0 );
#define NR_THREADS 64
// output buffer
RWStructuredBuffer<uint> g_vLayeredLightList : register( u0 ); // don't support RWBuffer yet in unity
RWStructuredBuffer<uint> g_LayeredOffset : register( u1 ); // don't support RWBuffer yet in unity
RWStructuredBuffer<uint> g_LayeredSingleIdxBuffer : register( u2 ); // don't support RWBuffer yet in unity
groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];
groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES/2];
groupshared float4 lightPlanes[4*6];
groupshared float4 lightPlanes[4*6]; // Each plane is defined by a float4. 6 planes per light, 4 lights (24 planes)
groupshared uint lightOffs;
groupshared uint lightOffsSph;
#endif
float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far
float GetLinearDepth(float zDptBufSpace, uint eyeIndex) // 0 is near 1 is far
float4x4 g_mInvScrProjection = g_mInvScrProjectionArr[eyeIndex];
// for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
// however this function must also work for orthographic projection so we keep it like this.
float m22 = g_mInvScrProjection[2].z, m23 = g_mInvScrProjection[2].w;
//return v4Pres.z / v4Pres.w;
}
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex)
float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex];
bool isOrthographic = g_isOrthographic!=0;
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth);
}
float GetOnePixDiagWorldDistAtDepthOne()
float GetOnePixDiagWorldDistAtDepthOne(uint eyeIndex)
float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex];
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
// SphericalIntersectionTests and CullByExactEdgeTests are close to the versions
// in lightlistbuild-bigtile.compute. But would need more re-factoring than needed
// right now.
int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane);
int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane, uint eyeIndex);
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate, uint eyeIndex);
float4 FetchPlane(int l, int p);
float4 FetchPlane(int l, int p, uint eyeIndex);
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase)
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase, uint eyeIndex)
// If this light's screen space depth bounds intersect this cluster...simple cluster test
// TODO: Unify this code with the code in CheckIntersectionBasic...
unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff;
bool bIsHit = ((val>>0)&0xff)<=((uint) k) && ((uint) k)<=((val>>8)&0xff);
if(bIsHit)
float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
float y = (i&2)==0 ? viTilLL.y : viTilUR.y;
float z = (i&4)==0 ? depthAtNearZ : depthAtFarZ;
float3 vP = GetViewPosFromLinDepth( float2(x, y), z);
float3 vP = GetViewPosFromLinDepth( float2(x, y), z, eyeIndex);
// Test each corner of the cluster against the light bounding box planes
bAllInvisib = bAllInvisib && dot(plane, float4(vP,1.0))>0;
}
return bIsHit;
}
// l is the coarse light index, k is the cluster index
bool CheckIntersectionBasic(int l, int k)
{
unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff;
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
uint eyeIndex = u3GroupID.z;
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
// Screen space coordinates of clustered tile
uint2 viTilLL = TILE_SIZE_CLUSTERED*tileIDX;
uint2 viTilUR = min( viTilLL+uint2(TILE_SIZE_CLUSTERED,TILE_SIZE_CLUSTERED), uint2(g_screenSize.x, g_screenSize.y) ); // not width and height minus 1 since viTilUR represents the end of the tile corner.
for(int idx=t; idx<(TILE_SIZE_CLUSTERED*TILE_SIZE_CLUSTERED); idx+=NR_THREADS)
{
// XRTODO: We need to stereo-ize access to g_depth_tex for texture arrays.
// TODO: For stereo double-wide, I need a proper way to insert the second eye width offset. Right now, I can just
// use g_screenSize.x, but that's kinda cheating.
// Additionally, we're going to have a method to select between a doublewide texture or texture array. Doubling
// the kernels seems like a bad idea. We could branch our texture read to switch between different texture declarations.
uint stereoDWOffset = eyeIndex * g_screenSize.x;
uPixCrd.x += stereoDWOffset;
#ifdef MSAA_ENABLED
for(int i=0; i<g_iNumSamplesMSAA; i++)
{
#endif
}
// Why is this a uint? Doesn't InterlockedMax support shared mem floats?
InterlockedMax(ldsZMax, asuint(dpt_ma) );
if(dpt_ma<=0.0) dpt_ma = VIEWPORT_SCALE_Z; // assume sky pixel
#endif
// 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
float2 vTileLL = float2(viTilLL.x/g_screenSize.x, viTilLL.y/g_screenSize.y);
float2 vTileUR = float2(viTilUR.x/g_screenSize.x, viTilUR.y/g_screenSize.y);
int NrBigTilesX = (nrTilesX+((1<<log2BigTileToClustTileRatio)-1))>>log2BigTileToClustTileRatio;
const int bigTileIdx = (tileIDX.y>>log2BigTileToClustTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToClustTileRatio); // map the idx to 64x64 tiles
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
int NrBigTilesX = (nrTilesX + ((1<<log2BigTileToClustTileRatio)-1)) >> log2BigTileToClustTileRatio;
int NrBigTilesY = (nrTilesY + ((1<<log2BigTileToClustTileRatio)-1)) >> log2BigTileToClustTileRatio;
const int bigTileBase = eyeIndex * NrBigTilesX * NrBigTilesY;
const int bigTileIdx = bigTileBase + ((tileIDX.y>>log2BigTileToClustTileRatio)*NrBigTilesX) + (tileIDX.x>>log2BigTileToClustTileRatio); // map the idx to 64x64 tiles
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
{
int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+l0+1];
#endif
const float2 vMi = g_vBoundsBuffer[l].xy;
const float2 vMa = g_vBoundsBuffer[l+g_iNrVisibLights].xy;
// TODO: Seems kinda funny that we repeat this exact code here, bigtile, and FPTL...
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;
if( all(vMa>vTileLL) && all(vMi<vTileUR))
{
int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(TILE_SIZE_CLUSTERED/2,TILE_SIZE_CLUSTERED/2), uint2(g_screenSize.x-1, g_screenSize.y-1))) );
iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(TILE_SIZE_CLUSTERED/2,TILE_SIZE_CLUSTERED/2), uint2(g_screenSize.x-1, g_screenSize.y-1))), eyeIndex );
float fTileFarPlane = GetLinearDepth(dpt_ma );
#else
float fTileFarPlane = -GetLinearDepth(dpt_ma );
float fTileFarPlane = GetLinearDepth(dpt_ma, eyeIndex );
#else // USE_LEFT_HAND_CAMERA_SPACE
float fTileFarPlane = -GetLinearDepth(dpt_ma, eyeIndex );
#else
#else // ENABLE_DEPTH_TEXTURE_BACKPLANE
float fTileFarPlane = g_fFarPlane;
float suggestedBase = g_fClustBase;
#endif
iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane);
iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane, eyeIndex);
// NOTE: Why not sort on console?
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
#endif
// TODO: We should write some encode/decode functions to help put cluster indices into the shared mem buffer,
// and extract them later. The code that reads from clusterIdx is hairy.
const unsigned int clustIdxMi0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0].z), suggestedBase));
const unsigned int clustIdxMa0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0+g_iNrVisibLights].z), suggestedBase));
const unsigned int clustIdxMi1 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1].z), suggestedBase));
const unsigned int clustIdxMa1 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1+g_iNrVisibLights].z), suggestedBase));
const ScreenSpaceBoundsIndices l0Bounds = GenerateScreenSpaceBoundsIndices(l0, g_iNrVisibLights, eyeIndex);
const ScreenSpaceBoundsIndices l1Bounds = GenerateScreenSpaceBoundsIndices(l1, g_iNrVisibLights, eyeIndex);
const unsigned int clustIdxMi0 = (const unsigned int)min(255, SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0Bounds.min].z, eyeIndex), suggestedBase));
const unsigned int clustIdxMa0 = (const unsigned int)min(255, SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0Bounds.max].z, eyeIndex), suggestedBase));
const unsigned int clustIdxMi1 = (const unsigned int)min(255, SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1Bounds.min].z, eyeIndex), suggestedBase));
const unsigned int clustIdxMa1 = (const unsigned int)min(255, SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1Bounds.max].z, eyeIndex), suggestedBase));
clusterIdxs[l] = (clustIdxMa1<<24) | (clustIdxMi1<<16) | (clustIdxMa0<<8) | (clustIdxMi0<<0);
}
}
int iSum = 0;
if(i<nrClusters)
{
// Each thread checks it's respective cluster against all coarse lights for intersection.
// At the end, 'iSum' represents the number of lights that intersect this cluster!
// We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
// want to allocate out of g_LayeredSingleIdxBuffer.
iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint) iSpaceAvail, start); // alloc list memory
}
int shiftIndex[LIGHTCATEGORY_COUNT];
ZERO_INITIALIZE_ARRAY(int, shiftIndex, LIGHTCATEGORY_COUNT);
// NOTE: Why is this indexed like this?
if(i<24) lightPlanes[6*m+p] = FetchPlane(min(iNrCoarseLights-1,ll+m), p);
if(i<24) lightPlanes[6*m+p] = FetchPlane(min(iNrCoarseLights-1,ll+m), p, eyeIndex);
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
if(offs<(start+iSpaceAvail) && i<nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase) )
if(offs<(start+iSpaceAvail) && i<nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex) )
uint lightCategory = _LightVolumeData[coarseList[l]].lightCategory;
const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
}
}
}
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
uint localOffs=0;
offs = i*nrTilesX*nrTilesY + tileIDX.y*nrTilesX + tileIDX.x;
offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);
for(int category=0; category<LIGHTCATEGORY_COUNT; category++)
{
int numLights = min(categoryListCount[category],31); // only allow 5 bits
}
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
if(threadID==0) g_logBaseBuffer[tileIDX.y*nrTilesX + tileIDX.x] = suggestedBase;
const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIDX, nrTilesX, nrTilesY, eyeIndex);
if(threadID==0) g_logBaseBuffer[logBaseIndex] = suggestedBase;
float4 FetchPlane(int l, int p)
float4 FetchPlane(int l, int p, uint eyeIndex)
SFiniteLightBound lgtDat = g_data[coarseList[l]];
const int lightBoundIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
SFiniteLightBound lgtDat = g_data[lightBoundIndex];
const float3 boxX = lgtDat.boxAxisX.xyz;
const float3 boxY = lgtDat.boxAxisY.xyz;
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate, uint eyeIndex)
float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0, eyeIndex);
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0, eyeIndex);
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne(eyeIndex);
SFiniteLightBound lgtDat = g_data[coarseList[l] ];
const int lightBoundIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
SFiniteLightBound lgtDat = g_data[lightBoundIndex ];
if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius, g_isOrthographic!=0) )
coarseList[l]=UINT_MAX;
#ifdef EXACT_EDGE_TESTS
float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane)
float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane, uint eyeIndex )
{
float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
float y = (i&2)==0 ? viTilLL.y : viTilUR.y;
#endif
return GetViewPosFromLinDepth( float2(x, y), z );
return GetViewPosFromLinDepth( float2(x, y), z, eyeIndex );
void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane)
void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane, uint eyeIndex )
vP0 = GetTileVertex(uint2(viTilLL.x, viTilUR.y), uint2(viTilUR.x, viTilLL.y), i, fTileFarPlane);
vP0 = GetTileVertex(uint2(viTilLL.x, viTilUR.y), uint2(viTilUR.x, viTilLL.y), i, fTileFarPlane, eyeIndex );
#if USE_LEFT_HAND_CAMERA_SPACE
float3 edgeSectionZero = g_isOrthographic==0 ? vP0 : float3(0.0,0.0,1.0);
vE0 = iSection == 0 ? edgeSectionZero : (((iSwizzle & 0x2) == 0 ? 1.0f : (-1.0f)) * ((int)(iSwizzle & 0x1) == (iSwizzle >> 1) ? float3(1, 0, 0) : float3(0, 1, 0)));
}
int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane)
int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane, uint eyeIndex )
{
if(threadID==0) lightOffs2 = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
GroupMemoryBarrierWithGroupSync();
#endif
const int idxCoarse = coarseList[l] ;
UNITY_BRANCH if (_LightVolumeData[idxCoars e].lightVolume != LIGHTVOLUMETYPE_SPHERE) // don't bother doing edge tests for sphere lights since these have camera aligned bboxes.
const int lightCullIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex) ;
UNITY_BRANCH if (_LightVolumeData[l ightCullIn dex ].lightVolume != LIGHTVOLUMETYPE_SPHERE) // don't bother doing edge tests for sphere lights since these have camera aligned bboxes.
SFiniteLightBound lgtDat = g_data[idxCoarse ];
SFiniteLightBound lgtDat = g_data[lightCullIndex ];
const float3 boxX = lgtDat.boxAxisX.xyz;
const float3 boxY = lgtDat.boxAxisY.xyz;
float3 vP1, vE1;
GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, fTileFarPlane);
GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, fTileFarPlane, eyeIndex );
// potential separation plane
float3 vN = cross(vE0, vE1);
positive=0; negative=0;
for(int j=0; j<8; j++)
{
float3 vPf = GetTileVertex(viTilLL, viTilUR, j, fTileFarPlane);
float3 vPf = GetTileVertex(viTilLL, viTilUR, j, fTileFarPlane, eyeIndex );
float fSignDist = dot(vN, vPf-vP0);
if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative;
}