Extensive notes about what and how to stereo-ize clustered light gen

Strictly comments, but this lays out the infrastructure needed to support stereo. Basically: * Screen-space texture? Generate stereo corrected coordinates * Light cull data (view dependent)? Generate stereo corrected indices * BigTile lookup? Generate stereo corrected indices * Screen-space light AABBs? Generate stereo corrected indices * Using projection matrices? Stereo indexing * Writing out layered offsets or backplane info? Stereo corrected. The only thing that really doesn't get stereo corrected is the clustered light list, as it is tightly allocated, completely independent of cluster location.
7 年前 · 1a077742
--- a/ScriptableRenderPipeline/HDRenderPipeline/HDRP/Lighting/LightLoop/LightLoop.cs
+++ b/ScriptableRenderPipeline/HDRenderPipeline/HDRP/Lighting/LightLoop/LightLoop.cs
            var invProjscrArr = new Matrix4x4[2];
            if (m_FrameSettings.enableStereo)
            {
+                // XRTODO: If possible, we could generate a non-oblique stereo projection
+                // matrix.  It's ok if it's not the exact same matrix, as long as it encompasses
+                // the same FOV as the original projection matrix (which would mean padding each half
+                // of the frustum with the max half-angle). We don't need the light information in 
+                // real projection space.  We just use screen space to figure out what is proximal
+                // to a cluster or tile.
                for (int eyeIndex = 0; eyeIndex < 2; eyeIndex++)
                {
                    projArr[eyeIndex] = CameraProjectionStereoLHS(hdCamera.camera, (Camera.StereoscopicEye)eyeIndex);
--- a/ScriptableRenderPipeline/HDRenderPipeline/HDRP/Lighting/LightLoop/lightlistbuild-bigtile.compute
+++ b/ScriptableRenderPipeline/HDRenderPipeline/HDRP/Lighting/LightLoop/lightlistbuild-bigtile.compute
 	uint2 viTilLL = 64*tileIDX;
 	uint2 viTilUR = min( viTilLL+uint2(64,64), uint2(iWidth, iHeight) );			// not width and height minus 1 since viTilUR represents the end of the tile corner.

-    // 'Normalized' coordinates of tile
+    // 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
 	float2 vTileLL = float2(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight);
 	float2 vTileUR = float2(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight);

--- a/ScriptableRenderPipeline/HDRenderPipeline/HDRP/Lighting/LightLoop/lightlistbuild-clustered.compute
+++ b/ScriptableRenderPipeline/HDRenderPipeline/HDRP/Lighting/LightLoop/lightlistbuild-clustered.compute
 #include "ShaderBase.hlsl"
 #include "LightLoop.cs.hlsl"
 #include "LightingConvexHullUtils.hlsl"
+#include "LightCullUtils.hlsl"

 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 #include "SortingComputeUtils.hlsl"

 CBUFFER_START(UnityLightListClustered)
 int g_iNrVisibLights;
+
+// XRTODO: Stereo-ize these
+
 uint g_isOrthographic;
 int _EnvLightIndexShift;
 int _DecalIndexShift;

 CBUFFER_END

+// ClusteredUtils.hlsl is dependent on the constants declared in UnityLightListClustered :/
+// g_fClustBase
+// g_fNearPlane
+// g_fFarPlane
+// g_iLog2NumClusters
-
+// XRTODO: Reading from these textures and buffers must be stereo-ized
 #ifdef MSAA_ENABLED
 Texture2DMS<float> g_depth_tex : register( t0 );
 #else

 #define NR_THREADS			64

+// XRTODO: Stereo-ize writes to these buffers (except g_LayeredSingleIdxBuffer)
+// XRTODO: Stereo-ize writes to g_logBaseBuffer
 #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
 RWStructuredBuffer<float> g_logBaseBuffer : register( u3 );				// don't support RWBuffer yet in unity
 #endif

 groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];
 groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES/2];
-groupshared float4 lightPlanes[4*6];
+groupshared float4 lightPlanes[4*6]; // Each plane is defined by a float4. 6 planes per light, 4 lights (24 planes)

 groupshared uint lightOffs;

 groupshared uint lightOffsSph;
 #endif

+// XRTODO: Stereo-ize access to g_mInvScrProjection, pass in eyeIndex
 float GetLinearDepth(float zDptBufSpace)    // 0 is near 1 is far
 {
    // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
    //return v4Pres.z / v4Pres.w;
 }

+// XRTODO: Stereo-ize access to g_mScrProjection, pass in eyeIndex
 float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
 {
 	bool isOrthographic = g_isOrthographic!=0;
 	return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth);
 }

+// XRTODO: Stereo-ize access to g_mScrProjection, pass in eyeIndex
 float GetOnePixDiagWorldDistAtDepthOne()
 {
 	float fSx = g_mScrProjection[0].x;
 }

+// SphericalIntersectionTests and CullByExactEdgeTests are close to the versions
+// in lightlistbuild-bigtile.compute.  But would need more re-factoring than needed
+// right now.
+
+// XRTODO: Stereo-ize these functions with eyeIndex
 #ifdef EXACT_EDGE_TESTS
 int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane);
 #endif

 // returns 1 for intersection and 0 for none

+// XRTODO: Pass in eyeIndex
-
+// XRTODO: Stereo-ize due to access to GetViewPosFromLinDepth
+// We need eyeIndex
+    // If this light's screen space depth bounds intersect this cluster...simple cluster test
+    // TODO: Unify this code with the code in CheckIntersectionBasic...
 	unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff;
 	bool bIsHit = ((val>>0)&0xff)<=((uint) k) && ((uint) k)<=((val>>8)&0xff);
 	if(bIsHit)
 				float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
 				float y = (i&2)==0 ? viTilLL.y : viTilUR.y;
 				float z = (i&4)==0 ? depthAtNearZ : depthAtFarZ;
+                // XRTODO: Stereo-ize this by passing in eyeIndex
+                // Test each corner of the cluster against the light bounding box planes
 				bAllInvisib = bAllInvisib && dot(plane, float4(vP,1.0))>0;
 			}

 	return bIsHit;
 }

+// l is the coarse light index, k is the cluster index
 bool CheckIntersectionBasic(int l, int k)
 {
 	unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff;
 [numthreads(NR_THREADS, 1, 1)]
 void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
 {
+    // XRTODO: Generate eyeIndex from u3GroupID.z
+
 	uint2 tileIDX = u3GroupID.xy;
 	uint t=threadID;


+    // Screen space coordinates of clustered tile
 	uint2 viTilLL = TILE_SIZE_CLUSTERED*tileIDX;
 	uint2 viTilUR = min( viTilLL+uint2(TILE_SIZE_CLUSTERED,TILE_SIZE_CLUSTERED), uint2(g_screenSize.x, g_screenSize.y) );		// not width and height minus 1 since viTilUR represents the end of the tile corner.


 	for(int idx=t; idx<(TILE_SIZE_CLUSTERED*TILE_SIZE_CLUSTERED); idx+=NR_THREADS)
 	{
+        // XRTODO: We need to stereo-ize access to g_depth_tex.  This is the only time we use viTilLL
+        // to generate a screen-space texture coordinate, so we can localize our stereo texture access here.
+        // For double-wide, we need to sample the correct half.  
+        // For instancing/multi-view, the right texture layer. When we add support for that, we need to pass in 
+        // an extra uint2 for layer index.
 		uint2 uPixCrd = min( uint2(viTilLL.x+(idx&(TILE_SIZE_CLUSTERED-1)), viTilLL.y+(idx>>log2TileSize)), uint2(g_screenSize.x-1, g_screenSize.y-1) );
 #ifdef MSAA_ENABLED
 		for(int i=0; i<g_iNumSamplesMSAA; i++)
 #endif
 	}

+    // Max across TG
+    // Why is this a uint? Can't we save floats in shared mem?
 	InterlockedMax(ldsZMax, asuint(dpt_ma) );


 	if(dpt_ma<=0.0) dpt_ma = VIEWPORT_SCALE_Z;		// assume sky pixel
 #endif

+    // 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
 	float2 vTileLL = float2(viTilLL.x/g_screenSize.x, viTilLL.y/g_screenSize.y);
 	float2 vTileUR = float2(viTilUR.x/g_screenSize.x, viTilUR.y/g_screenSize.y);


+    // XRTODO: Properly stereo-ize access to g_vBigTileLightList
+    // All of this code is localized here, so I don't really have to worry about side-effects further on down
+    // I need to generate NrBigTilesY, so I can generate a per-eye offset/base into g_vBigTileLightList
+    // Since bigTileIdx is used twice (once for count, once for the light list), I should probably fix that one.
+    // Would be worth function-alizing, because this code is shared with FPTL/lightlistbuild.compute
 	int NrBigTilesX = (nrTilesX+((1<<log2BigTileToClustTileRatio)-1))>>log2BigTileToClustTileRatio;
 	const int bigTileIdx = (tileIDX.y>>log2BigTileToClustTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToClustTileRatio);		// map the idx to 64x64 tiles
 	int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
 	for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
 	{
 #endif
+        // XRTODO: Once we have our light index (l), we need to make sure it indexes the 
+        // correct portion of g_vBoundsBuffer. I have that code in GenerateScreenSpaceBoundsIndices
+
+        // TODO: Seems kinda funny that we repeat this exact code here, bigtile, and FPTL...
+
 		const float2 vMi = g_vBoundsBuffer[l].xy;
 		const float2 vMa = g_vBoundsBuffer[l+g_iNrVisibLights].xy;


 	int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);

+    // XRTODO: Stereo-ize by passing in eyeIndex
 #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
 	iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(TILE_SIZE_CLUSTERED/2,TILE_SIZE_CLUSTERED/2), uint2(g_screenSize.x-1, g_screenSize.y-1))) );
 #endif
+    // XRTODO: Stereo-ize access to GetLinearDepth with eyeIndex
-#else
+#else // USE_LEFT_HAND_CAMERA_SPACE
-#else
+#else // ENABLE_DEPTH_TEXTURE_BACKPLANE
+    // XRTODO: Stereo-ize by passing in eyeIndex
+    // TODO: Why not sort on console?
 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 	SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
 #endif
+        // This code is a little tricky. For each light in our coarse list (and the list is associated with the screen tile),
+        // we find a cluster associated with the lights AABB Z value.  For each light, we have a min and max cluster index.
+        // And because the cluster index is a max of 255, we are able to encode 4 cluster indices per 32-bit DWORD.
+        // Therefore, we can encode 2 lights per uint (light 0 min idx, light 0 max idx, light 1...)
+        // Each iteration of the loop goes over two neighboring lights in the coarseList, unfortunate name choice of 'l'
+
+        // TODO: We should write some encode/decode functions to help put cluster indices into the shared mem buffer,
+        // and extract them later.  The code that reads from clusterIdx is hairy.
+
+            // XRTODO: Stereo-ize access to g_vBoundsBuffer, run l0 and l1 through GenerateScreenSpaceBoundsIndices
+            // The logic here is a bit confusing.  We seem to process pairs of lights, and 
+            // It's for the cluster in the tile (the depth layer slice whatever)
+
+            // XRTODO: Stereo-ize GetLinearDepth with eyeIndex
 			const unsigned int clustIdxMi0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0].z), suggestedBase));
 			const unsigned int clustIdxMa0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0+g_iNrVisibLights].z), suggestedBase));
 			const unsigned int clustIdxMi1 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1].z), suggestedBase));
 	int i=(int) t;
 	int iSpaceAvail = 0;
 	int iSum = 0;
+    // Each thread in the TG represents a cluster index, and tests all coarse lights against that cluster.
+    // It should be noted that nrClusters can never be greater than the TG size, otherwise, this code doesn't work!
+        // Each thread checks its respective cluster against all coarse lights for intersection.
+        // At the end, 'iSum' represents the number of lights that intersect this cluster!
+        // We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
+        // want to allocate out of g_LayeredSingleIdxBuffer.
+
+        // All the light lists live in g_vLayeredLightList.  They aren't sorted in any manner, and it's tightly packed.
+        // The allocation can handle the max lights per tile, but it likely won't use all the memory.
+        // g_LayeredSingleIdxBuffer is recording the allocations out of the buffer.  
+        // 'start' indicates the offset into g_vLayeredLightList for the _cluster_ being 
+        // processed by this thread. And the TG is processing the tile. 
+        // 'iSpaceAvail' is how many total lights are in this cluster.
+        // This allocation might be roughly over, because CheckIntersectionBasic is a very basic check.
+
+        // XRTODO: For stereo, we don't have to adjust anything into g_LayeredSingleIdxBuffer. Each thread is processing its own
+        // cluster, so we just need to make sure there is enough memory allocated in g_vLayeredLightList for two eyes worth of
+        // lists.  The offset we get from start is good enough.  We do have to store the offset into the 
+        // stereo-corrected half of g_LayeredOffset.
 	}

 	// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
+    // TODO: Why is this indexed like this?
+
+        // Process in chunks of 4 lights from the coarse tile list
+
+        // XRTODO: Pass in eyeIndex to FetchPlane, as it looks into g_data (SFiniteLightBound)
+
+        // The first 24 threads in the TG each generate 1 plane equation.  There are
+        // 6 planes per light, and we process 4 lights at a time, hence, 24 threads.
+        // We could do more, but this might be all the LDS that can be spared...
+
 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 		GroupMemoryBarrierWithGroupSync();
 #endif
+            // XRTODO: Stereo-ize CheckIntersection by passing in eyeIndex to GetViewPosFromLinDepth
+                // XRTODO: Stereo-ize index into _LightVolumeData, run coarseList[l] thru GenerateLightCullDataIndex
-			}
+			    
+                // TODO: I think there _might_ be a potential bug here. The way this code seems to work is that
+                // as each light comes in, they are added to the allocated list in g_vLayeredLightList.
+                // As each light is added, the respective category count is incremented, and the raw light index is
+                // altered by subtracting the shift associated with the category.  So these light indices are category
+                // dependent. Since they are category dependent, these indices have to be grouped into category sub-lists
+                // inside the cluster's allocation.  But...when the coarseList is generated, there's no guarantee it is sorted
+                // unless it runs through the SORTLIST routine above.
+            }
 		}

 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)

+    // We now fill information about each light type into g_LayeredOffset
+    // The offset into g_vLayeredLightList, encoded with the number of lights in the category
+    // The encoded offsets are assuming the lights are sorted by category in the cluster list
+
+    // XRTODO: Stereo-ize this initial 'offs' to jump into the correct half of g_LayeredOffset.
+    // The offsets are organized Category/Cluster/Row/Column.
+    // For stereo, we just add eyeIndex*LIGHTCATEGORY_COUNT*nrClusters*nrTilesX*nrTilesY
 	offs = i*nrTilesX*nrTilesY + tileIDX.y*nrTilesX + tileIDX.x;
 	for(int category=0; category<LIGHTCATEGORY_COUNT; category++)
 	{
 	}

 #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
+    // XRTODO: Stereo-ize this, as this is per-eye set of tiles
+    // Add eyeIndex*nrTilesY*nrTilesX to this index
+// XRTODO: Must be stereo-ized because it fetches from g_data
+// Pass in eyeIndex
+    // XRTODO: Use eyeIndex and coarseList[l] to index into g_data
+    // use GenerateLightCullDataIndex
 	SFiniteLightBound lgtDat = g_data[coarseList[l]];

 	const float3 boxX = lgtDat.boxAxisX.xyz;



-
+// XRTODO: Stereo-ize for a bunch of use cases! Pass in eyeIndex!
+// Needed for GetViewPosFromLinDepth, GetOnePixDiagWorldDistAtDepthOne, and g_data
+    // XRTODO: Stereo-ize access with eyeIndex
 #if USE_LEFT_HAND_CAMERA_SPACE
 	float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
 #else
+    // XRTODO: Stereo-ize access with eyeIndex
+        // XRTODO: Stereo-ize access with eyeIndex and GenerateLightCullDataIndex
+        // Use it on coarseList[l], use result to index g_data
 		SFiniteLightBound lgtDat = g_data[coarseList[l]];

 		if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius, g_isOrthographic!=0) )

 #ifdef EXACT_EDGE_TESTS

+// XRTODO: Stereo-ize for access to GetViewPosFromLinDepth, use eyeIndex
 float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane)
 {
 	float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
 	return GetViewPosFromLinDepth( float2(x, y), z);
 }

+// XRTODO: Stereo-ize for access to GetTileVertex, use eyeIndex
 void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane)
 {
 	int iSection = e0>>2;		// section 0 is side edges, section 1 is near edges and section 2 is far edges
 	vE0 = iSection == 0 ? edgeSectionZero : (((iSwizzle & 0x2) == 0 ? 1.0f : (-1.0f)) * ((int)(iSwizzle & 0x1) == (iSwizzle >> 1) ? float3(1, 0, 0) : float3(0, 1, 0)));
 }

+// XRTODO: Stereo-ize with eyeIndex, used for indexing _LightVolumeData and g_data, 
+// and with GetFrustEdge and GetTileVertex functions
 int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane)
 {
 	if(threadID==0) lightOffs2 = 0;
 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 		GroupMemoryBarrierWithGroupSync();
 #endif
+        // XRTODO: stereo-ize index used to access _LightVolumeData (and g_data), use GenerateLightCullDataIndex and eyeIndex
+            // XRTODO: stereo-ize index used to access g_data, use the same index generated above from idxCoarse
 			SFiniteLightBound lgtDat = g_data[idxCoarse];

 			const float3 boxX = lgtDat.boxAxisX.xyz;


 				float3 vP1, vE1;
+                // XRTODO: Stereo-ize to use GetFrustEdge, use eyeIndex
 				GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, fTileFarPlane);

 				// potential separation plane
 				positive=0; negative=0;
 				for(int j=0; j<8; j++)
 				{
+                    // XRTODO: Stereo-ize to use GetTileVertex, use eyeIndex
 					float3 vPf = GetTileVertex(viTilLL, viTilUR, j, fTileFarPlane);
 					float fSignDist = dot(vN, vPf-vP0);
 					if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative;