separate out sort routines

8 年前 · 4e597199
--- a/Assets/ScriptableRenderLoop/fptl/lightlistbuild-bigtile.compute
+++ b/Assets/ScriptableRenderLoop/fptl/lightlistbuild-bigtile.compute
 #include "LightDefinitions.cs.hlsl"

 #include "LightingConvexHullUtils.hlsl"
+#include "SortingComputeUtils.hlsl"

 #define EXACT_EDGE_TESTS
 #define PERFORM_SPHERICAL_INTERSECTION_TESTS


 	// sort lights
-	sortLightList((int) t, iNrCoarseLights);	// sorting on GCN as well for now since we need the culled lights weeded out.
-
+	SORTLIST(lightsListLDS, iNrCoarseLights, MAX_NR_BIGTILE_LIGHTS_PLUSONE, t, NR_THREADS);

 	lightOffs = 0;
 	GroupMemoryBarrierWithGroupSync();
 		g_vLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*offs + i] = t==0 ? iNrCoarseLights : lightsListLDS[i-1];
 }

-
-
-unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
-{
-#if 0
-	unsigned int value = 1;
-	
-	while(value<value_in && (value<<1)<=maxValue)
-		value<<=1;
-
-	return value_in==0 ? 0 : value;
-#else
-	uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in));		// firstbithigh(0) returns -1
-	return max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0));	// max() just in case of overflow
-#endif
-}
-
-
-void sortLightList(int localThreadID, int length)
-{
-	// closest pow2 integer greater than or equal to length
-	const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_BIGTILE_LIGHTS_PLUSONE);			// N is 1 when length is zero but will still not enter first for-loop
-
-	// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
-	for(int t=length+localThreadID; t<N; t+=NR_THREADS) { lightsListLDS[t]=0xffffffff; }		// impossible index
-	GroupMemoryBarrierWithGroupSync();
-
-	for(int k=2; k<=N; k=2*k)
-	{
-		for(int j=k>>1; j>0; j=j>>1)
-		{
-			for(int i=localThreadID; i<N; i+=NR_THREADS)
-			{
-				int ixj=i^j;
-				if((ixj)>i)
-				{
-					const unsigned int Avalue = lightsListLDS[i];
-					const unsigned int Bvalue = lightsListLDS[ixj];
-
-					const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
-					if(mustSwap)
-					{
-						lightsListLDS[i]=Bvalue;
-						lightsListLDS[ixj]=Avalue;
-					}
-				}
-			}
-
-			GroupMemoryBarrierWithGroupSync();
-		}
-	}
-}

 #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
 void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
--- a/Assets/ScriptableRenderLoop/fptl/lightlistbuild-clustered.compute
+++ b/Assets/ScriptableRenderLoop/fptl/lightlistbuild-clustered.compute

 #include "LightingConvexHullUtils.hlsl"

+#if !defined(XBONE) && !defined(PLAYSTATION4)
+#include "SortingComputeUtils.hlsl"
+#endif
+
 //#define EXACT_EDGE_TESTS
 #define PERFORM_SPHERICAL_INTERSECTION_TESTS
 #define CONV_HULL_TEST_ENABLED

 	return length( float2(1.0/fSx,1.0/fSy) );
 }
-
-void sortLightList(int localThreadID, int n);

 #ifdef EXACT_EDGE_TESTS
 int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane);

 // sort lights
 #if !defined(XBONE) && !defined(PLAYSTATION4)
-	sortLightList((int) t, iNrCoarseLights);
+	SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
 #endif

 	//////////// cell specific code
 	g_logBaseBuffer[tileIDX.y*nrTilesX + tileIDX.x] = suggestedBase;
 #endif
 }
-
-
-unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
-{
-#if 0
-	unsigned int value = 1;
-	
-	while(value<value_in && (value<<1)<=maxValue)
-		value<<=1;
-
-	return value_in==0 ? 0 : value;
-#else
-	uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in));		// firstbithigh(0) returns -1
-	return max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0));	// max() just in case of overflow
-#endif
-}
-
-
-void sortLightList(int localThreadID, int length)
-{
-	// closest pow2 integer greater than or equal to length
-	const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_COARSE_ENTRIES);			// N is 1 when length is zero but will still not enter first for-loop
-
-	// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
-	for(int t=length+localThreadID; t<N; t+=NR_THREADS) { coarseList[t]=0xffffffff; }		// impossible index
-	GroupMemoryBarrierWithGroupSync();
-
-	for(int k=2; k<=N; k=2*k)
-	{
-		for(int j=k>>1; j>0; j=j>>1)
-		{
-			for(int i=localThreadID; i<N; i+=NR_THREADS)
-			{
-				int ixj=i^j;
-				if((ixj)>i)
-				{
-					const unsigned int Avalue = coarseList[i];
-					const unsigned int Bvalue = coarseList[ixj];
-
-					const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
-					if(mustSwap)
-					{
-						coarseList[i]=Bvalue;
-						coarseList[ixj]=Avalue;
-					}
-				}
-			}
-
-			GroupMemoryBarrierWithGroupSync();
-		}
-	}
-}
-
-


 float4 FetchPlane(int l, int p)
--- a/Assets/ScriptableRenderLoop/fptl/lightlistbuild.compute
+++ b/Assets/ScriptableRenderLoop/fptl/lightlistbuild.compute

 #include "LightingConvexHullUtils.hlsl"

+#if !defined(XBONE) && !defined(PLAYSTATION4)
+#include "SortingComputeUtils.hlsl"
+#endif
+

 #define FINE_PRUNING_ENABLED
 #define PERFORM_SPHERICAL_INTERSECTION_TESTS

 	return length( float2(1.0/fSx,1.0/fSy) );
 }
-
-void sortLightList(int localThreadID, int n);

 #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
 int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);

 	// sort lights
 #if !defined(XBONE) && !defined(PLAYSTATION4)
-	sortLightList((int) t, nrLightsCombinedList);
+	SORTLIST(prunedList, nrLightsCombinedList, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
+	//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
 #endif

 	// write lights to global buffers

 }

-
-// original version
-//float2 vRay2D = float2(max(V.x,V.y), fSclProj);
-//float distSqB = bIsSpotDisc ? distSq : dot(vRay2D,vRay2D);
-//if( all( float3(lightData.radiusSq, fSclProj, fSclProj) > float3(distSq, sqrt(distSqB)*lightData.fPenumbra, 0.0) ) ) uVal = 1;
-
-
-
-// previous new version
-//float fDist2DSqr = bIsSpotDisc ? dot(V,V) : (maC*maC);
-//if( all( float3(lightData.radiusSq, (fSclProj*fSclProj), fSclProj) > float3(distSq, fDist2DSqr*cotaSqr, fSpotNearPlane) ) ) uVal = 1;
-
-#if 0
-void merge(int l, int m, int r);
-
-void sortLightList(int localThreadID, int n)
-{
-   for(int curr_size=1; curr_size<=n-1; curr_size = 2*curr_size)
-   {
-		for(int left_start=localThreadID*(2*curr_size); left_start<(n-1); left_start+=NR_THREADS*(2*curr_size))
-		{
-			int mid = left_start + curr_size - 1;
-			int right_end = min(left_start + 2*curr_size - 1, n-1);
-			merge(left_start, mid, right_end);
-		}
-
-	   GroupMemoryBarrierWithGroupSync();
-   }
-}
-
-//groupshared unsigned int tmpBuffer[MAX_NR_COARSE_ENTRIES];
-
-void merge(int l, int m, int r)
-{
-    int i, j, k;
-    
-	int ol = l;		
-	int or = m+1;	
-	int sl = m - l + 1;		// capacity is size of left list = m - l + 1;
-    int sr =  r - m;		// capacity is size of right list = r - m
-
-	unsigned int tmpBuffer[] = coarseList;		// re use coarse list buffer as temp buffer.
-
-	// could do this copy more efficiently before the if-statement
-	// in sortLightList() but this requires another GroupMemoryBarrierWithGroupSync()
-	for(int i=l; i<=r; i++) tmpBuffer[i] = prunedList[i];
- 
-    i = 0;
-    j = 0;
-    k = l;
-    while (i < sl && j < sr)
-    {
-		const uint lVal = tmpBuffer[ol+i];
-		const uint rVal = tmpBuffer[or+j];
-		bool pickLeft = lVal <= rVal;
-		i = pickLeft ? (i+1) : i;
-		j = pickLeft ? j : (j+1);
-		prunedList[k] = pickLeft ? lVal : rVal;
-        k++;
-    }
- 
-    while (i < sl)
-    {
-        prunedList[k] = tmpBuffer[ol+i];
-        i++; k++;
-    }
- 
-    while (j < sr)
-    {
-        prunedList[k] = tmpBuffer[or+j];
-        j++; k++;
-    }
-}
- 
-#else
-
-unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
-{
-#if 0
-	unsigned int value = 1;
-	
-	while(value<value_in && (value<<1)<=maxValue)
-		value<<=1;
-
-	return value_in==0 ? 0 : value;
-#else
-	uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in));		// firstbithigh(0) returns -1
-	return max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0));	// max() just in case of overflow
-#endif
-}
-
-
-void sortLightList(int localThreadID, int length)
-{
-	// closest pow2 integer greater than or equal to length
-	const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_COARSE_ENTRIES);			// N is 1 when length is zero but will still not enter first for-loop
-
-	// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
-	for(int t=length+localThreadID; t<N; t+=NR_THREADS) { prunedList[t]=0xffffffff; }		// impossible index
-	GroupMemoryBarrierWithGroupSync();
-
-	for(int k=2; k<=N; k=2*k)
-	{
-		for(int j=k>>1; j>0; j=j>>1)
-		{
-			for(int i=localThreadID; i<N; i+=NR_THREADS)
-			{
-				int ixj=i^j;
-				if((ixj)>i)
-				{
-					const unsigned int Avalue = prunedList[i];
-					const unsigned int Bvalue = prunedList[ixj];
-
-					const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
-					if(mustSwap)
-					{
-						prunedList[i]=Bvalue;
-						prunedList[ixj]=Avalue;
-					}
-				}
-			}
-
-			GroupMemoryBarrierWithGroupSync();
-		}
-	}
-}
-
-#endif


 #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
--- a/Assets/ScriptableRenderLoop/fptl/SortingComputeUtils.hlsl
+++ b/Assets/ScriptableRenderLoop/fptl/SortingComputeUtils.hlsl
+#ifndef __SORTINGCOMPUTEUTILS_H__
+#define __SORTINGCOMPUTEUTILS_H__
+
+unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
+{
+#if 0
+	unsigned int value = 1;
+	
+	while(value<value_in && (value<<1)<=maxValue)
+		value<<=1;
+
+	return value_in==0 ? 0 : value;
+#else
+	uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in));		// firstbithigh(0) returns -1
+	valpw2 = max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0));	// max() just in case of overflow
+	return min(valpw2, maxValue);
+#endif
+}
+
+// have to make this sort routine a macro unfortunately because hlsl doesn't take
+// groupshared memory of unspecified length as an input parameter to a function.
+// maxcapacity_in must be a power of two.
+// all data from length_in and up to closest power of two will be filled with 0xffffffff
+#define SORTLIST(data, length_in, maxcapacity_in, localThreadID_in, nrthreads_in)	\
+{	\
+	int length=(int) length_in, maxcapacity=(int) maxcapacity_in, localThreadID=(int) localThreadID_in, nrthreads=(int) nrthreads_in;	\
+																							\
+	const int N = (const int) LimitPow2AndClamp((unsigned int) length, (uint) maxcapacity);	\
+	for(int t=length+localThreadID; t<N; t+=nrthreads) { data[t]=0xffffffff; }				\
+	GroupMemoryBarrierWithGroupSync();														\
+																							\
+	for(int k=2; k<=N; k=2*k)																\
+	{																						\
+		for(int j=k>>1; j>0; j=j>>1)														\
+		{																					\
+			for(int i=localThreadID; i<N; i+=nrthreads)										\
+			{																				\
+				int ixj=i^j;																\
+				if((ixj)>i)																	\
+				{																			\
+					const unsigned int Avalue = data[i];									\
+					const unsigned int Bvalue = data[ixj];									\
+																							\
+					const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;		\
+					if(mustSwap)						\
+					{									\
+						data[i]=Bvalue;					\
+						data[ixj]=Avalue;				\
+					}					\
+				}						\
+			}							\
+										\
+			GroupMemoryBarrierWithGroupSync();		\
+		}		\
+	}		\
+}
+
+// have to make this sort routine a macro unfortunately because hlsl doesn't take
+// groupshared memory of unspecified length as an input parameter to a function.
+// merge-sort is not in-place so two buffers are required: data and tmpdata.
+// These must both have a capacity of at least length_in entries and initial
+// input is assumed to be in data and results will be delivered in data.
+#define MERGESORTLIST(data, tmpdata, length_in, localThreadID_in, nrthreads_in)	\
+{			\
+	int length=(int) length_in, localThreadID=(int) localThreadID_in, nrthreads=(int) nrthreads_in;	\
+																										\
+	for(int curr_size=1; curr_size<=length-1; curr_size = 2*curr_size)													\
+	{																												\
+		for(int left_start=localThreadID*(2*curr_size); left_start<(length-1); left_start+=nrthreads*(2*curr_size))		\
+		{																											\
+			int mid = left_start + curr_size - 1;																	\
+			int right_end = min(left_start + 2*curr_size - 1, length-1);											\
+			{																										\
+				int l=left_start, m=mid, r=right_end;																\
+																													\
+				int i, j, k;																						\
+																													\
+				int ol = l;																							\
+				int or = m+1;																						\
+				int sl = m - l + 1;																					\
+				int sr =  r - m;																					\
+																													\
+				for(int i=l; i<=r; i++) tmpdata[i] = data[i];														\
+																													\
+				i = 0; j = 0; k = l;																				\
+				while (i < sl && j < sr)																			\
+				{																									\
+					const uint lVal = tmpdata[ol+i];																\
+					const uint rVal = tmpdata[or+j];																\
+					bool pickLeft = lVal <= rVal;																	\
+					i = pickLeft ? (i+1) : i;																		\
+					j = pickLeft ? j : (j+1);																		\
+					data[k] = pickLeft ? lVal : rVal;																\
+					k++;																							\
+				}																									\
+																													\
+				while (i < sl)																						\
+				{																									\
+					data[k] = tmpdata[ol+i];																		\
+					i++; k++;																						\
+				}																									\
+																													\
+				while (j < sr)																						\
+				{																									\
+					data[k] = tmpdata[or+j];																		\
+					j++; k++;																						\
+				}																									\
+			}																										\
+		}																											\
+																													\
+	   GroupMemoryBarrierWithGroupSync();																			\
+   }																												\
+}
+
+
+
+#endif