remove local atomics from FPTL for Mobile

8 年前 · ac3a3376
--- a/Assets/ScriptableRenderPipeline/fptl/lightlistbuild.compute
+++ b/Assets/ScriptableRenderPipeline/fptl/lightlistbuild.compute
 #include "SortingComputeUtils.hlsl"
 #endif

+#define NARROW_MOBILE_ENABLED	// master switch for the narrow/mobile code paths guarded by this symbol throughout this file
+
+#ifdef NARROW_MOBILE_ENABLED
+	#define EMUL_LOCAL_ATOMICS	// makes LocalAtomics.hlsl provide its tgbuffer-based Interlocked* macros instead of forwarding to the built-in Interlocked* calls
+#endif
+

 #define FINE_PRUNING_ENABLED
 #define PERFORM_SPHERICAL_INTERSECTION_TESTS
 StructuredBuffer<uint> g_vBigTileLightList : register( t4 );		// don't support Buffer yet in unity
 #endif

-#define NR_THREADS			64
+#ifdef NARROW_MOBILE_ENABLED
+	#define NR_THREADS			32	// narrow/mobile path uses half the thread count of the default path
+#else
+	#define NR_THREADS			64
+#endif
+
+#include "LocalAtomics.hlsl"	// included after the defines above: the header tests EMUL_LOCAL_ATOMICS and sizes its scratch buffer with NR_THREADS

 // output buffer
 RWStructuredBuffer<uint> g_vLightList : register( u0 );				// don't support RWBuffer yet in unity
 #endif

 #ifdef FINE_PRUNING_ENABLED
+
+#ifndef NARROW_MOBILE_ENABLED
+#else
+void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths1, float4 vLinDepths2);
+#endif
+
 #endif


 	uint2 tileIDX = u3GroupID.xy;
 	uint t=threadID;

+#ifndef NARROW_MOBILE_ENABLED
+#else
+	for(int i=(int) t; i<MAX_NR_COARSE_ENTRIES; i+=NR_THREADS)
+		prunedList[i]=0;
+#endif
 	
 	uint iWidth = g_viDimensions.x;
 	uint iHeight = g_viDimensions.y;


 	float4 vLinDepths;
+#ifdef NARROW_MOBILE_ENABLED
+	float4 vLinDepths2;
+#endif
+#ifndef NARROW_MOBILE_ENABLED
+#else
+		for(int i = 0; i < 8; i++)
+#endif
-			vLinDepths[i] = GetLinearDepth(fDepth);
+			const float linDepth = GetLinearDepth(fDepth);
+#ifndef NARROW_MOBILE_ENABLED
+			vLinDepths[i] = linDepth;
+#else
+			if(i<4) vLinDepths[i] = linDepth;
+			else vLinDepths2[i-4] = linDepth;
+#endif
 			if(fDepth<VIEWPORT_SCALE_Z)		// if not skydome
 			{
 				dpt_mi = min(fDepth, dpt_mi);

-		InterlockedMax(ldsZMax, asuint(dpt_ma));
-		InterlockedMin(ldsZMin, asuint(dpt_mi));
+		InterlockedMAX(ldsZMax, asuint(dpt_ma), threadID);
+		InterlockedMIN(ldsZMin, asuint(dpt_mi), threadID);


 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 	int NrBigTilesX = (nrTilesX+3)>>2;
 	const int bigTileIdx = (tileIDX.y>>2)*NrBigTilesX + (tileIDX.x>>2);		// map the idx to 64x64 tiles
 	int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+0];
+	int nrLightsIn = nrBigTileLights;
+	int nrLightsIn = (int) g_iNrVisibLights;
-
+#ifndef EMUL_LOCAL_ATOMICS
 		if( all(vMa>vTileLL) && all(vMi<vTileUR))
 		{
 			unsigned int uInc = 1;
 		}
+#else
+		unsigned int uInc = (all(vMa>vTileLL) && all(vMi<vTileUR)) ? 1 : 0;
+		unsigned int uIndex;
+		InterlockedADDAndPrev(lightOffs, uInc, uIndex, t, l, nrLightsIn);
+		if(uIndex<MAX_NR_COARSE_ENTRIES && uInc!=0) coarseList[uIndex] = l;		// add to light list
+#endif
 	}

 #ifdef FINE_PRUNING_ENABLED	

 #ifndef FINE_PRUNING_ENABLED	
 	{
+#ifndef NARROW_MOBILE_ENABLED
+#else
+		// Fine pruning disabled: copy the coarse list straight into prunedList, NR_THREADS entries per pass.
+		// The loop must terminate on the strided index i; testing the thread id t never changes across
+		// iterations, so any thread with t<iNrCoarseLights would spin forever and run off both arrays.
+		for(int i=(int) t; i<iNrCoarseLights; i+=NR_THREADS) prunedList[i] = coarseList[i];
+#endif
 		if(t==0) ldsNrLightsFinal=iNrCoarseLights;
 	}
 #else
+#ifndef NARROW_MOBILE_ENABLED
+#else
+		FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths, vLinDepths2);
+#endif
 	}
 #endif

 	int nrLightsCombinedList = min(ldsNrLightsFinal,MAX_NR_COARSE_ENTRIES);
 	for(int i=t; i<nrLightsCombinedList; i+=NR_THREADS) 
 	{
+#ifndef EMUL_LOCAL_ATOMICS
+#else
+		uint model = g_vLightData[ prunedList[i] ].lightModel;
+		for(int m=0; m<NR_LIGHT_MODELS; m++)
+		{
+			uint uInc = model==m ? 1 : 0;
+			InterlockedADD(ldsModelListCount[m], uInc, threadID, i, nrLightsCombinedList);
+		}
+#endif
-#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
+#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL) && !defined(NARROW_MOBILE_ENABLED)
 	SORTLIST(prunedList, nrLightsCombinedList, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
 	//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
 #endif
 	for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
 	{
 		SFiniteLightBound lightData = g_data[prunedList[l]];
-	
-		if( DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius) )
+
+		bool bHit = DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius);
+#ifndef EMUL_LOCAL_ATOMICS	
+		if( bHit )
+#else
+		unsigned int uInc = bHit ? 1 : 0;
+		unsigned int uIndex;
+		InterlockedADDAndPrev(lightOffsSph, uInc, uIndex, threadID, l, iNrCoarseLights);
+		if(bHit) coarseList[uIndex]=prunedList[l];		// read from the original copy of coarseList which is backed up in prunedList
+#endif
 	}

 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 #ifdef FINE_PRUNING_ENABLED
 // initializes ldsNrLightsFinal with the number of accepted lights.
 // all accepted entries delivered in prunedList[].
+
+#ifndef NARROW_MOBILE_ENABLED
+#else
+void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths1, float4 vLinDepths2)
+#endif
+#ifndef NARROW_MOBILE_ENABLED
+	const int numPixSerial = 4;
+#else
+	const int numPixSerial = 8;
+#endif
 	uint t = threadID;
 	uint iWidth = g_viDimensions.x;
 	uint iHeight = g_viDimensions.y;
 				
 			// serially check 4 pixels
 			uint uVal = 0;
-			for(int i=0; i<4; i++)
+			for(int i=0; i<numPixSerial; i++)
-				float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
+#ifdef NARROW_MOBILE_ENABLED
+				float4 vLinDepths = i<4 ? vLinDepths1 : vLinDepths2;
+#endif
+				float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i&0x3]);
 	
 				// check pixel
 				float3 fromLight = vVPos-lightData.lightPos.xyz;

 			// serially check 4 pixels
 			uint uVal = 0;
-			for(int i=0; i<4; i++)
+			for(int i=0; i<numPixSerial; i++)
-				float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
+#ifdef NARROW_MOBILE_ENABLED
+				float4 vLinDepths = i<4 ? vLinDepths1 : vLinDepths2;
+#endif
+				float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i&0x3]);
 	
 				// check pixel
 				float3 vLp = lightData.lightPos.xyz;

 			// serially check 4 pixels
 			uint uVal = 0;
-			for(int i=0; i<4; i++)
+			for(int i=0; i<numPixSerial; i++)
-				float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
+#ifdef NARROW_MOBILE_ENABLED
+				float4 vLinDepths = i<4 ? vLinDepths1 : vLinDepths2;
+#endif
+				float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i&0x3]);

 				// check pixel
 				float3 toLight  = lightData.lightPos.xyz - vVPos;
 		if(uLgtType>=MAX_TYPES) ++l;
 	}

-	InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
-	InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
+	InterlockedOR(ldsDoesLightIntersect[0], uLightsFlags[0], threadID);
+	InterlockedOR(ldsDoesLightIntersect[1], uLightsFlags[1], threadID);
 	if(t==0) ldsNrLightsFinal = 0;

 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
+#ifndef NARROW_MOBILE_ENABLED
 	if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 )
 	{
 		unsigned int uInc = 1;
 	}
+#else
+	for(uint i=t; i<MAX_NR_COARSE_ENTRIES; i+=NR_THREADS)
+	{
+		unsigned int uInc = (i<(uint) iNrCoarseLights && (ldsDoesLightIntersect[i<32 ? 0 : 1]&(1<<(i&31)))!=0) ? 1 : 0;
+		unsigned int uIndex;
+		InterlockedADDAndPrev(ldsNrLightsFinal, uInc, uIndex, t, i, MAX_NR_COARSE_ENTRIES);
+		if(uInc!=0 && uIndex<MAX_NR_COARSE_ENTRIES) prunedList[uIndex] = coarseList[i];		// we allow up to 64 pruned lights while stored in LDS.
+	}
+#endif
 }
 #endif
--- a/Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl
+++ b/Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl
+#ifndef __LOCALATOMICS_H__
+#define __LOCALATOMICS_H__
+
+
+
+#ifdef EMUL_LOCAL_ATOMICS
+// Scratch storage for the emulated atomics below: one slot per thread, indexed by the threadid macro argument.
+// NOTE(review): the macros separate their phases with GroupMemoryBarrier(), not GroupMemoryBarrierWithGroupSync(); presumably this relies on the group's threads running in lockstep - confirm for the target hardware.
+groupshared unsigned int tgbuffer[NR_THREADS];
+// InterlockedOR(data, val, threadid): emulated group-wide OR of every thread's val into data, staged through tgbuffer. All NR_THREADS slots are read without any guard, so presumably every thread of the group must invoke it.
+#define InterlockedOR(data, val, threadid)	\
+{	\
+	{	\
+	tgbuffer[threadid] = (uint) val;	/* publish this thread's operand */	\
+	GroupMemoryBarrier();	\
+	if((threadid&0x3)==0)	/* first thread of each run of 4 folds the run into its own slot */	\
+	{	\
+		tgbuffer[threadid+0] |= (tgbuffer[threadid+1] | tgbuffer[threadid+2] | tgbuffer[threadid+3]);	\
+	}	\
+	GroupMemoryBarrier();	\
+	if(threadid==0)	/* thread 0 folds the per-run results (slots 0,4,8,...) and applies them to data */	\
+	{	\
+		uint result = tgbuffer[0];	\
+		for(int i=4; i<NR_THREADS; i+=0x4)  { result |= tgbuffer[i]; }	\
+		data |= result; \
+	}	\
+	GroupMemoryBarrier();	\
+	}	\
+}
+// InterlockedMAX(data, val, threadid): emulated group-wide max; operands are cast to uint and compared as unsigned. All NR_THREADS slots are read without any guard, so presumably every thread of the group must invoke it.
+#define InterlockedMAX(data, val, threadid)	\
+{	\
+	{	\
+	tgbuffer[threadid] = (uint) val;	/* publish this thread's operand */	\
+	GroupMemoryBarrier();	\
+	if((threadid&0x3)==0)	/* first thread of each run of 4 reduces the run into its own slot */	\
+	{	\
+		tgbuffer[threadid+0] = max( max(tgbuffer[threadid+0], tgbuffer[threadid+1]), max(tgbuffer[threadid+2],tgbuffer[threadid+3]) );	\
+	}	\
+	GroupMemoryBarrier();	\
+	if(threadid==0)	/* thread 0 reduces the per-run results (slots 0,4,8,...) and applies them to data */	\
+	{	\
+		uint result = tgbuffer[0];	\
+		for(int i=4; i<NR_THREADS; i+=0x4)  { result = max(result, tgbuffer[i]); }	\
+		data = max(data,result); \
+	}	\
+	GroupMemoryBarrier();	\
+	}	\
+}
+// InterlockedMIN(data, val, threadid): emulated group-wide min; operands are cast to uint and compared as unsigned. All NR_THREADS slots are read without any guard, so presumably every thread of the group must invoke it.
+#define InterlockedMIN(data, val, threadid)	\
+{	\
+	{	\
+	tgbuffer[threadid] = (uint) val;	/* publish this thread's operand */	\
+	GroupMemoryBarrier();	\
+	if((threadid&0x3)==0)	/* first thread of each run of 4 reduces the run into its own slot */	\
+	{	\
+		tgbuffer[threadid+0] = min( min(tgbuffer[threadid+0], tgbuffer[threadid+1]), min(tgbuffer[threadid+2],tgbuffer[threadid+3]) );	\
+	}	\
+	GroupMemoryBarrier();	\
+	if(threadid==0)	/* thread 0 reduces the per-run results (slots 0,4,8,...) and applies them to data */	\
+	{	\
+		uint result = tgbuffer[0];	\
+		for(int i=4; i<NR_THREADS; i+=0x4)  { result = min(result, tgbuffer[i]); }	\
+		data = min(data,result); \
+	}	\
+	GroupMemoryBarrier();	\
+	}	\
+}
+// InterlockedADD(data, val, threadid, idx, nrIterations): emulated group-wide add of every participating thread's val into data. idx and nrIterations are used only to work out how many threads participate; slots at or beyond that count are treated as 0.
+#define InterlockedADD(data, val, threadid, idx, nrIterations)	\
+{	\
+	{	\
+	const int nrActiveThreads = min(NR_THREADS, nrIterations-(idx&(~(NR_THREADS-1))));	/* nrIterations minus idx with its low bits masked off, capped at NR_THREADS; the mask rounds idx down to a multiple of NR_THREADS only if NR_THREADS is a power of two */ \
+	tgbuffer[threadid] = (uint) val;	/* publish this thread's operand */	\
+	GroupMemoryBarrier();	\
+	if((threadid&0x3)==0)	/* first thread of each run of 4 sums the run into its own slot */	\
+	{	\
+		uint val1 = (threadid+1)<nrActiveThreads ? tgbuffer[threadid+1] : 0;	\
+		uint val2 = (threadid+2)<nrActiveThreads ? tgbuffer[threadid+2] : 0;	\
+		uint val3 = (threadid+3)<nrActiveThreads ? tgbuffer[threadid+3] : 0;	\
+		tgbuffer[threadid+0] += (val1+val2+val3);	\
+	}	\
+	GroupMemoryBarrier();	\
+	if(threadid==0)	/* thread 0 sums the per-run totals (slots 0,4,8,...) and adds them to data */	\
+	{	\
+		uint result = tgbuffer[0];	\
+		for(int i=4; i<NR_THREADS; i+=0x4)	\
+		{	\
+			result += (i<nrActiveThreads ? tgbuffer[i] : 0);	\
+		}	\
+		data += result; \
+	}	\
+	GroupMemoryBarrier();	\
+	}	\
+}
+// InterlockedADDAndPrev(data, val, prevval, threadid, idx, nrIterations): emulated add that also yields a per-thread "previous" value.
+// It builds an inclusive running sum of the vals in tgbuffer, ordered by threadid, in three phases:
+//   1) the first thread of each run of 4 accumulates a running sum inside its run,
+//   2) thread 0 chains the run totals (slots 3,7,11,...) into running totals,
+//   3) the first thread of every later run adds the preceding run's running total to its first three slots.
+#define InterlockedADDAndPrev(data, val, prevval, threadid, idx, nrIterations)	\
+{	\
+	{	\
+	const int nrActiveThreads = min(NR_THREADS, nrIterations-(idx&(~(NR_THREADS-1))));	/* same formula as in InterlockedADD; used below only to pick the slot holding the grand total */ \
+	tgbuffer[threadid] = (uint) val;	/* publish this thread's operand */	\
+	GroupMemoryBarrier();	\
+	if((threadid&0x3)==0)	/* phase 1: running sum within each run of 4 */	\
+	{	\
+		for(int i=1; i<4; i++) tgbuffer[threadid+i] += tgbuffer[threadid+i-1];	\
+	}	\
+	GroupMemoryBarrier();	\
+	if(threadid==0)	/* phase 2: last slot of each run becomes the running total up to and including that run */	\
+	{	\
+		for(int i=0x7; i<NR_THREADS; i+=0x4) tgbuffer[i] += tgbuffer[i-0x4];	\
+	}	\
+	GroupMemoryBarrier();	\
+	uint prevblock = tgbuffer[max(1,threadid)-1];	/* slot just before this thread (thread 0 reads slot 0); only consumed by run leaders below */	\
+	GroupMemoryBarrier();	\
+	if((threadid&0x3)==0 && threadid>0)	/* phase 3: the run's 4th slot already holds its running total, so only 3 slots are patched */	\
+	{	\
+		for(int i=0; i<3; i++) tgbuffer[threadid+i] += prevblock;	\
+	}	\
+	uint orgdata = data;	/* snapshot data before thread 0 overwrites it below */	\
+	GroupMemoryBarrier();	\
+	prevval = (orgdata + tgbuffer[threadid]) - val;	/* inclusive sum minus own val: data plus the vals of all lower-numbered threads */	\
+	if(threadid==0) data = orgdata + tgbuffer[nrActiveThreads-1];	/* slot of the last participating thread holds the sum of all participating vals */	\
+	GroupMemoryBarrier();	\
+	}	\
+}
+
+#else	// EMUL_LOCAL_ATOMICS not defined: each macro forwards to the built-in call; the threadid/idx/nrIterations arguments are dropped
+
+#define InterlockedOR(data, val, threadid)	InterlockedOr(data, val)
+#define InterlockedMAX(data, val, threadid)	InterlockedMax(data, val)
+#define InterlockedMIN(data, val, threadid)	InterlockedMin(data, val)
+#define InterlockedADD(data, val, threadid, idx, nrIterations)	InterlockedAdd(data, val)
+#define InterlockedADDAndPrev(data, val, prevval, threadid, idx, nrIterations)	InterlockedAdd(data, val, prevval)
+
+
+#endif	// EMUL_LOCAL_ATOMICS
+
+
+
+
+
+#endif
--- a/Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl.meta
+++ b/Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl.meta
+fileFormatVersion: 2
+guid: 02dfbc89d64584ef59e83f293ae7ee49
+timeCreated: 1489193687
+licenseType: Pro
+ShaderImporter:
+  defaultTextures: []
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: