浏览代码

remove local atomics from FPTL for Mobile

/main
Filip Iliescu 8 年前
当前提交
ac3a3376
共有 3 个文件被更改,包括 270 次插入16 次删除
  1. 130
      Assets/ScriptableRenderPipeline/fptl/lightlistbuild.compute
  2. 147
      Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl
  3. 9
      Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl.meta

130
Assets/ScriptableRenderPipeline/fptl/lightlistbuild.compute


#include "SortingComputeUtils.hlsl"
#endif
#define NARROW_MOBILE_ENABLED
#ifdef NARROW_MOBILE_ENABLED
#define EMUL_LOCAL_ATOMICS
#endif
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

StructuredBuffer<uint> g_vBigTileLightList : register( t4 ); // don't support Buffer yet in unity
#endif
#define NR_THREADS 64
#ifdef NARROW_MOBILE_ENABLED
#define NR_THREADS 32
#else
#define NR_THREADS 64
#endif
#include "LocalAtomics.hlsl"
// output buffer
RWStructuredBuffer<uint> g_vLightList : register( u0 ); // don't support RWBuffer yet in unity

#endif
#ifdef FINE_PRUNING_ENABLED
#ifndef NARROW_MOBILE_ENABLED
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths1, float4 vLinDepths2);
#endif
#endif

uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
#ifndef NARROW_MOBILE_ENABLED
#else
for(int i=(int) t; i<MAX_NR_COARSE_ENTRIES; i+=NR_THREADS)
prunedList[i]=0;
#endif
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;

float4 vLinDepths;
#ifdef NARROW_MOBILE_ENABLED
float4 vLinDepths2;
#endif
#ifndef NARROW_MOBILE_ENABLED
#else
for(int i = 0; i < 8; i++)
#endif
vLinDepths[i] = GetLinearDepth(fDepth);
const float linDepth = GetLinearDepth(fDepth);
#ifndef NARROW_MOBILE_ENABLED
vLinDepths[i] = linDepth;
#else
if(i<4) vLinDepths[i] = linDepth;
else vLinDepths2[i-4] = linDepth;
#endif
if(fDepth<VIEWPORT_SCALE_Z) // if not skydome
{
dpt_mi = min(fDepth, dpt_mi);

InterlockedMax(ldsZMax, asuint(dpt_ma));
InterlockedMin(ldsZMin, asuint(dpt_mi));
InterlockedMAX(ldsZMax, asuint(dpt_ma), threadID);
InterlockedMIN(ldsZMin, asuint(dpt_mi), threadID);
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)

int NrBigTilesX = (nrTilesX+3)>>2;
const int bigTileIdx = (tileIDX.y>>2)*NrBigTilesX + (tileIDX.x>>2); // map the idx to 64x64 tiles
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+0];
int nrLightsIn = nrBigTileLights;
int nrLightsIn = (int) g_iNrVisibLights;
#ifndef EMUL_LOCAL_ATOMICS
if( all(vMa>vTileLL) && all(vMi<vTileUR))
{
unsigned int uInc = 1;

}
#else
unsigned int uInc = (all(vMa>vTileLL) && all(vMi<vTileUR)) ? 1 : 0;
unsigned int uIndex;
InterlockedADDAndPrev(lightOffs, uInc, uIndex, t, l, nrLightsIn);
if(uIndex<MAX_NR_COARSE_ENTRIES && uInc!=0) coarseList[uIndex] = l; // add to light list
#endif
}
#ifdef FINE_PRUNING_ENABLED

#ifndef FINE_PRUNING_ENABLED
{
#ifndef NARROW_MOBILE_ENABLED
#else
for(int i=(int) t; t<iNrCoarseLights; i+=NR_THREADS) prunedList[i] = coarseList[i];
#endif
if(t==0) ldsNrLightsFinal=iNrCoarseLights;
}
#else

#ifndef NARROW_MOBILE_ENABLED
#else
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths, vLinDepths2);
#endif
}
#endif

int nrLightsCombinedList = min(ldsNrLightsFinal,MAX_NR_COARSE_ENTRIES);
for(int i=t; i<nrLightsCombinedList; i+=NR_THREADS)
{
#ifndef EMUL_LOCAL_ATOMICS
#else
uint model = g_vLightData[ prunedList[i] ].lightModel;
for(int m=0; m<NR_LIGHT_MODELS; m++)
{
uint uInc = model==m ? 1 : 0;
InterlockedADD(ldsModelListCount[m], uInc, threadID, i, nrLightsCombinedList);
}
#endif
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL) && !defined(NARROW_MOBILE_ENABLED)
SORTLIST(prunedList, nrLightsCombinedList, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif

for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
{
SFiniteLightBound lightData = g_data[prunedList[l]];
if( DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius) )
bool bHit = DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius);
#ifndef EMUL_LOCAL_ATOMICS
if( bHit )
#else
unsigned int uInc = bHit ? 1 : 0;
unsigned int uIndex;
InterlockedADDAndPrev(lightOffsSph, uInc, uIndex, threadID, l, iNrCoarseLights);
if(bHit) coarseList[uIndex]=prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
#endif
}
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)

#ifdef FINE_PRUNING_ENABLED
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#ifndef NARROW_MOBILE_ENABLED
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths1, float4 vLinDepths2)
#endif
#ifndef NARROW_MOBILE_ENABLED
const int numPixSerial = 4;
#else
const int numPixSerial = 8;
#endif
uint t = threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;

// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
for(int i=0; i<numPixSerial; i++)
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
#ifdef NARROW_MOBILE_ENABLED
float4 vLinDepths = i<4 ? vLinDepths1 : vLinDepths2;
#endif
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i&0x3]);
// check pixel
float3 fromLight = vVPos-lightData.lightPos.xyz;

// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
for(int i=0; i<numPixSerial; i++)
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
#ifdef NARROW_MOBILE_ENABLED
float4 vLinDepths = i<4 ? vLinDepths1 : vLinDepths2;
#endif
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i&0x3]);
// check pixel
float3 vLp = lightData.lightPos.xyz;

// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
for(int i=0; i<numPixSerial; i++)
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
#ifdef NARROW_MOBILE_ENABLED
float4 vLinDepths = i<4 ? vLinDepths1 : vLinDepths2;
#endif
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i&0x3]);
// check pixel
float3 toLight = lightData.lightPos.xyz - vVPos;

if(uLgtType>=MAX_TYPES) ++l;
}
InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
InterlockedOR(ldsDoesLightIntersect[0], uLightsFlags[0], threadID);
InterlockedOR(ldsDoesLightIntersect[1], uLightsFlags[1], threadID);
if(t==0) ldsNrLightsFinal = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)

#ifndef NARROW_MOBILE_ENABLED
if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 )
{
unsigned int uInc = 1;

}
#else
for(uint i=t; i<MAX_NR_COARSE_ENTRIES; i+=NR_THREADS)
{
unsigned int uInc = (i<(uint) iNrCoarseLights && (ldsDoesLightIntersect[i<32 ? 0 : 1]&(1<<(i&31)))!=0) ? 1 : 0;
unsigned int uIndex;
InterlockedADDAndPrev(ldsNrLightsFinal, uInc, uIndex, t, i, MAX_NR_COARSE_ENTRIES);
if(uInc!=0 && uIndex<MAX_NR_COARSE_ENTRIES) prunedList[uIndex] = coarseList[i]; // we allow up to 64 pruned lights while stored in LDS.
}
#endif
}
#endif

147
Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl


// Group-wide replacements for the HLSL Interlocked* intrinsics.
// When EMUL_LOCAL_ATOMICS is defined, the Interlocked* macros in this file
// are built from plain reads/writes of a groupshared scratch array plus
// GroupMemoryBarrier() calls; otherwise they forward to the native intrinsics.
// NR_THREADS is not defined in this file, so it must be defined before this
// file is included.
#ifndef __LOCALATOMICS_H__
#define __LOCALATOMICS_H__
#ifdef EMUL_LOCAL_ATOMICS
// Scratch storage shared by the emulated atomics: one slot per thread,
// indexed by the thread id passed to each macro.
groupshared unsigned int tgbuffer[NR_THREADS];
// Emulated group-wide bitwise OR of `val` into `data`.
//   data     : lvalue that receives the OR of its old value and every thread's val.
//   val      : this thread's contribution (converted to uint).
//   threadid : index of the calling thread inside the group; used as the
//              tgbuffer slot, so it must lie in [0, NR_THREADS).
// Steps: each thread stores val in its tgbuffer slot, every fourth thread
// folds its own slot with the next three, then thread 0 folds the per-quad
// partial results and applies them to data.
// All NR_THREADS slots are read unconditionally, so every thread of the group
// is expected to invoke the macro, and NR_THREADS is expected to be a
// multiple of 4 (slots threadid+1..threadid+3 are read by each quad leader).
// Every macro argument is parenthesized so that expression arguments
// (e.g. `base + lane` as threadid) keep their intended meaning.
// NOTE(review): the steps are separated only by GroupMemoryBarrier(); confirm
// that the target executes the whole group in lock-step, since no explicit
// execution sync is issued here.
#define InterlockedOR(data, val, threadid) \
{ \
    { \
        tgbuffer[(threadid)] = (uint) (val); \
        GroupMemoryBarrier(); \
        if(((threadid)&0x3)==0) \
        { \
            tgbuffer[(threadid)+0] |= (tgbuffer[(threadid)+1] | tgbuffer[(threadid)+2] | tgbuffer[(threadid)+3]); \
        } \
        GroupMemoryBarrier(); \
        if((threadid)==0) \
        { \
            uint result = tgbuffer[0]; \
            for(int i=4; i<NR_THREADS; i+=0x4) { result |= tgbuffer[i]; } \
            (data) |= result; \
        } \
        GroupMemoryBarrier(); \
    } \
}
// Emulated group-wide unsigned maximum: data = max(data, max over all threads of val).
//   data     : lvalue holding the running maximum.
//   val      : this thread's candidate (converted to uint).
//   threadid : index of the calling thread inside the group; used as the
//              tgbuffer slot, so it must lie in [0, NR_THREADS).
// Steps: each thread stores val in its tgbuffer slot, every fourth thread
// reduces its own slot with the next three, then thread 0 reduces the
// per-quad results and merges them into data.
// All NR_THREADS slots are read unconditionally, so every thread of the group
// is expected to invoke the macro, and NR_THREADS is expected to be a
// multiple of 4.
// Every macro argument is parenthesized so that expression arguments keep
// their intended meaning; note that data is evaluated more than once.
// NOTE(review): the steps are separated only by GroupMemoryBarrier(); confirm
// that the target executes the whole group in lock-step, since no explicit
// execution sync is issued here.
#define InterlockedMAX(data, val, threadid) \
{ \
    { \
        tgbuffer[(threadid)] = (uint) (val); \
        GroupMemoryBarrier(); \
        if(((threadid)&0x3)==0) \
        { \
            tgbuffer[(threadid)+0] = max( max(tgbuffer[(threadid)+0], tgbuffer[(threadid)+1]), max(tgbuffer[(threadid)+2],tgbuffer[(threadid)+3]) ); \
        } \
        GroupMemoryBarrier(); \
        if((threadid)==0) \
        { \
            uint result = tgbuffer[0]; \
            for(int i=4; i<NR_THREADS; i+=0x4) { result = max(result, tgbuffer[i]); } \
            (data) = max((data),result); \
        } \
        GroupMemoryBarrier(); \
    } \
}
// Emulated group-wide unsigned minimum: data = min(data, min over all threads of val).
//   data     : lvalue holding the running minimum.
//   val      : this thread's candidate (converted to uint).
//   threadid : index of the calling thread inside the group; used as the
//              tgbuffer slot, so it must lie in [0, NR_THREADS).
// Steps: each thread stores val in its tgbuffer slot, every fourth thread
// reduces its own slot with the next three, then thread 0 reduces the
// per-quad results and merges them into data.
// All NR_THREADS slots are read unconditionally, so every thread of the group
// is expected to invoke the macro, and NR_THREADS is expected to be a
// multiple of 4.
// Every macro argument is parenthesized so that expression arguments keep
// their intended meaning; note that data is evaluated more than once.
// NOTE(review): the steps are separated only by GroupMemoryBarrier(); confirm
// that the target executes the whole group in lock-step, since no explicit
// execution sync is issued here.
#define InterlockedMIN(data, val, threadid) \
{ \
    { \
        tgbuffer[(threadid)] = (uint) (val); \
        GroupMemoryBarrier(); \
        if(((threadid)&0x3)==0) \
        { \
            tgbuffer[(threadid)+0] = min( min(tgbuffer[(threadid)+0], tgbuffer[(threadid)+1]), min(tgbuffer[(threadid)+2],tgbuffer[(threadid)+3]) ); \
        } \
        GroupMemoryBarrier(); \
        if((threadid)==0) \
        { \
            uint result = tgbuffer[0]; \
            for(int i=4; i<NR_THREADS; i+=0x4) { result = min(result, tgbuffer[i]); } \
            (data) = min((data),result); \
        } \
        GroupMemoryBarrier(); \
    } \
}
// Emulated group-wide add: data += sum of val over the participating threads.
//   data         : lvalue accumulator.
//   val          : this thread's addend (converted to uint).
//   threadid     : index of the calling thread inside the group; used as the
//                  tgbuffer slot, so it must lie in [0, NR_THREADS).
//   idx          : the caller's current loop index.
//   nrIterations : total trip count of the caller's loop.
// nrActiveThreads is nrIterations minus idx with its low bits masked off by
// ~(NR_THREADS-1), clamped to NR_THREADS; slots at or beyond that count are
// treated as zero instead of being read. The masking rounds idx down to a
// multiple of NR_THREADS only if NR_THREADS is a power of two.
// Steps: each thread stores val in its tgbuffer slot, every fourth thread
// sums its own slot with the active ones among the next three, then thread 0
// sums the per-quad totals and adds them to data.
// Every macro argument is parenthesized so that expression arguments (for
// example a ternary passed as nrIterations) keep their intended meaning.
// NOTE(review): the steps are separated only by GroupMemoryBarrier(); confirm
// that the target executes the whole group in lock-step, since no explicit
// execution sync is issued here.
#define InterlockedADD(data, val, threadid, idx, nrIterations) \
{ \
    { \
        const int nrActiveThreads = min(NR_THREADS, (nrIterations)-((idx)&(~(NR_THREADS-1)))); \
        tgbuffer[(threadid)] = (uint) (val); \
        GroupMemoryBarrier(); \
        if(((threadid)&0x3)==0) \
        { \
            uint val1 = ((threadid)+1)<nrActiveThreads ? tgbuffer[(threadid)+1] : 0; \
            uint val2 = ((threadid)+2)<nrActiveThreads ? tgbuffer[(threadid)+2] : 0; \
            uint val3 = ((threadid)+3)<nrActiveThreads ? tgbuffer[(threadid)+3] : 0; \
            tgbuffer[(threadid)+0] += (val1+val2+val3); \
        } \
        GroupMemoryBarrier(); \
        if((threadid)==0) \
        { \
            uint result = tgbuffer[0]; \
            for(int i=4; i<NR_THREADS; i+=0x4) \
            { \
                result += (i<nrActiveThreads ? tgbuffer[i] : 0); \
            } \
            (data) += result; \
        } \
        GroupMemoryBarrier(); \
    } \
}
// Emulated group-wide add that also returns a per-thread "previous value".
//   data         : lvalue accumulator; thread 0 rewrites it as its old value
//                  plus the running total found in slot nrActiveThreads-1.
//   val          : this thread's addend (converted to uint); evaluated twice,
//                  so it should be free of side effects.
//   prevval      : lvalue that receives data's old value plus the vals of all
//                  lower-numbered threads (an exclusive running sum).
//   threadid     : index of the calling thread inside the group; used as the
//                  tgbuffer slot, so it must lie in [0, NR_THREADS).
//   idx          : the caller's current loop index.
//   nrIterations : total trip count of the caller's loop.
// nrActiveThreads is nrIterations minus idx with its low bits masked off by
// ~(NR_THREADS-1), clamped to NR_THREADS (a round-down to a multiple of
// NR_THREADS only if NR_THREADS is a power of two).
// Steps:
//   1. each thread stores val in its tgbuffer slot;
//   2. every fourth thread turns its four slots into an inclusive running sum;
//   3. thread 0 chains the last slot of each quad (7, 11, ...) so those slots
//      hold the total up to and including their quad;
//   4. each thread reads the slot just before its own (prevblock); quad
//      leaders other than thread 0 add it to their first three slots;
//   5. prevval = old data + own slot - val, and thread 0 updates data.
// Every macro argument is parenthesized so that expression arguments (for
// example `a + b` passed as val) keep their intended meaning.
// NOTE(review): the steps are separated only by GroupMemoryBarrier(); confirm
// that the target executes the whole group in lock-step, since no explicit
// execution sync is issued here.
#define InterlockedADDAndPrev(data, val, prevval, threadid, idx, nrIterations) \
{ \
    { \
        const int nrActiveThreads = min(NR_THREADS, (nrIterations)-((idx)&(~(NR_THREADS-1)))); \
        tgbuffer[(threadid)] = (uint) (val); \
        GroupMemoryBarrier(); \
        if(((threadid)&0x3)==0) \
        { \
            for(int i=1; i<4; i++) tgbuffer[(threadid)+i] += tgbuffer[(threadid)+i-1]; \
        } \
        GroupMemoryBarrier(); \
        if((threadid)==0) \
        { \
            for(int i=0x7; i<NR_THREADS; i+=0x4) tgbuffer[i] += tgbuffer[i-0x4]; \
        } \
        GroupMemoryBarrier(); \
        uint prevblock = tgbuffer[max(1,(threadid))-1]; \
        GroupMemoryBarrier(); \
        if(((threadid)&0x3)==0 && (threadid)>0) \
        { \
            for(int i=0; i<3; i++) tgbuffer[(threadid)+i] += prevblock; \
        } \
        uint orgdata = (data); \
        GroupMemoryBarrier(); \
        (prevval) = (orgdata + tgbuffer[(threadid)]) - (val); \
        if((threadid)==0) (data) = orgdata + tgbuffer[nrActiveThreads-1]; \
        GroupMemoryBarrier(); \
    } \
}
#else
// EMUL_LOCAL_ATOMICS is not defined: forward each macro to the corresponding
// HLSL intrinsic. The threadid, idx and nrIterations arguments are accepted
// for call-site compatibility with the emulated versions and are discarded.
#define InterlockedOR(data, val, threadid) InterlockedOr(data, val)
#define InterlockedMAX(data, val, threadid) InterlockedMax(data, val)
#define InterlockedMIN(data, val, threadid) InterlockedMin(data, val)
#define InterlockedADD(data, val, threadid, idx, nrIterations) InterlockedAdd(data, val)
#define InterlockedADDAndPrev(data, val, prevval, threadid, idx, nrIterations) InterlockedAdd(data, val, prevval)
#endif
#endif

9
Assets/ScriptableRenderPipeline/fptl/LocalAtomics.hlsl.meta


fileFormatVersion: 2
guid: 02dfbc89d64584ef59e83f293ae7ee49
timeCreated: 1489193687
licenseType: Pro
ShaderImporter:
defaultTextures: []
userData:
assetBundleName:
assetBundleVariant:
正在加载...
取消
保存