浏览代码

separate out sort routines

separate out sort routines
/main
mmikk 8 年前
当前提交
4e597199
共有 4 个文件被更改,包括 130 次插入242 次删除
  1. 56
      Assets/ScriptableRenderLoop/fptl/lightlistbuild-bigtile.compute
  2. 62
      Assets/ScriptableRenderLoop/fptl/lightlistbuild-clustered.compute
  3. 137
      Assets/ScriptableRenderLoop/fptl/lightlistbuild.compute
  4. 117
      Assets/ScriptableRenderLoop/fptl/SortingComputeUtils.hlsl

56
Assets/ScriptableRenderLoop/fptl/lightlistbuild-bigtile.compute


#include "LightDefinitions.cs.hlsl"
#include "LightingConvexHullUtils.hlsl"
#include "SortingComputeUtils.hlsl"
#define EXACT_EDGE_TESTS
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

// sort lights
sortLightList((int) t, iNrCoarseLights); // sorting on GCN as well for now since we need the culled lights weeded out.
SORTLIST(lightsListLDS, iNrCoarseLights, MAX_NR_BIGTILE_LIGHTS_PLUSONE, t, NR_THREADS);
lightOffs = 0;
GroupMemoryBarrierWithGroupSync();

g_vLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*offs + i] = t==0 ? iNrCoarseLights : lightsListLDS[i-1];
}
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
#if 0
unsigned int value = 1;
while(value<value_in && (value<<1)<=maxValue)
value<<=1;
return value_in==0 ? 0 : value;
#else
uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in)); // firstbithigh(0) returns -1
return max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0)); // max() just in case of overflow
#endif
}
void sortLightList(int localThreadID, int length)
{
// closest pow2 integer greater than or equal to length
const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_BIGTILE_LIGHTS_PLUSONE); // N is 1 when length is zero but will still not enter first for-loop
// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
for(int t=length+localThreadID; t<N; t+=NR_THREADS) { lightsListLDS[t]=0xffffffff; } // impossible index
GroupMemoryBarrierWithGroupSync();
for(int k=2; k<=N; k=2*k)
{
for(int j=k>>1; j>0; j=j>>1)
{
for(int i=localThreadID; i<N; i+=NR_THREADS)
{
int ixj=i^j;
if((ixj)>i)
{
const unsigned int Avalue = lightsListLDS[i];
const unsigned int Bvalue = lightsListLDS[ixj];
const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
if(mustSwap)
{
lightsListLDS[i]=Bvalue;
lightsListLDS[ixj]=Avalue;
}
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)

62
Assets/ScriptableRenderLoop/fptl/lightlistbuild-clustered.compute


#include "LightingConvexHullUtils.hlsl"
#if !defined(XBONE) && !defined(PLAYSTATION4)
#include "SortingComputeUtils.hlsl"
#endif
//#define EXACT_EDGE_TESTS
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
#define CONV_HULL_TEST_ENABLED

return length( float2(1.0/fSx,1.0/fSy) );
}
void sortLightList(int localThreadID, int n);
#ifdef EXACT_EDGE_TESTS
int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane);

// sort lights
#if !defined(XBONE) && !defined(PLAYSTATION4)
sortLightList((int) t, iNrCoarseLights);
SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
#endif
//////////// cell specific code

g_logBaseBuffer[tileIDX.y*nrTilesX + tileIDX.x] = suggestedBase;
#endif
}
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
#if 0
unsigned int value = 1;
while(value<value_in && (value<<1)<=maxValue)
value<<=1;
return value_in==0 ? 0 : value;
#else
uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in)); // firstbithigh(0) returns -1
return max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0)); // max() just in case of overflow
#endif
}
void sortLightList(int localThreadID, int length)
{
// closest pow2 integer greater than or equal to length
const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_COARSE_ENTRIES); // N is 1 when length is zero but will still not enter first for-loop
// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
for(int t=length+localThreadID; t<N; t+=NR_THREADS) { coarseList[t]=0xffffffff; } // impossible index
GroupMemoryBarrierWithGroupSync();
for(int k=2; k<=N; k=2*k)
{
for(int j=k>>1; j>0; j=j>>1)
{
for(int i=localThreadID; i<N; i+=NR_THREADS)
{
int ixj=i^j;
if((ixj)>i)
{
const unsigned int Avalue = coarseList[i];
const unsigned int Bvalue = coarseList[ixj];
const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
if(mustSwap)
{
coarseList[i]=Bvalue;
coarseList[ixj]=Avalue;
}
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
float4 FetchPlane(int l, int p)

137
Assets/ScriptableRenderLoop/fptl/lightlistbuild.compute


#include "LightingConvexHullUtils.hlsl"
#if !defined(XBONE) && !defined(PLAYSTATION4)
#include "SortingComputeUtils.hlsl"
#endif
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

return length( float2(1.0/fSx,1.0/fSy) );
}
void sortLightList(int localThreadID, int n);
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);

// sort lights
#if !defined(XBONE) && !defined(PLAYSTATION4)
sortLightList((int) t, nrLightsCombinedList);
SORTLIST(prunedList, nrLightsCombinedList, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif
// write lights to global buffers

}
// original version
//float2 vRay2D = float2(max(V.x,V.y), fSclProj);
//float distSqB = bIsSpotDisc ? distSq : dot(vRay2D,vRay2D);
//if( all( float3(lightData.radiusSq, fSclProj, fSclProj) > float3(distSq, sqrt(distSqB)*lightData.fPenumbra, 0.0) ) ) uVal = 1;
// previous new version
//float fDist2DSqr = bIsSpotDisc ? dot(V,V) : (maC*maC);
//if( all( float3(lightData.radiusSq, (fSclProj*fSclProj), fSclProj) > float3(distSq, fDist2DSqr*cotaSqr, fSpotNearPlane) ) ) uVal = 1;
#if 0
void merge(int l, int m, int r);
void sortLightList(int localThreadID, int n)
{
for(int curr_size=1; curr_size<=n-1; curr_size = 2*curr_size)
{
for(int left_start=localThreadID*(2*curr_size); left_start<(n-1); left_start+=NR_THREADS*(2*curr_size))
{
int mid = left_start + curr_size - 1;
int right_end = min(left_start + 2*curr_size - 1, n-1);
merge(left_start, mid, right_end);
}
GroupMemoryBarrierWithGroupSync();
}
}
//groupshared unsigned int tmpBuffer[MAX_NR_COARSE_ENTRIES];
void merge(int l, int m, int r)
{
int i, j, k;
int ol = l;
int or = m+1;
int sl = m - l + 1; // capacity is size of left list = m - l + 1;
int sr = r - m; // capacity is size of right list = r - m
unsigned int tmpBuffer[] = coarseList; // re use coarse list buffer as temp buffer.
// could do this copy more efficiently before the if-statement
// in sortLightList() but this requires another GroupMemoryBarrierWithGroupSync()
for(int i=l; i<=r; i++) tmpBuffer[i] = prunedList[i];
i = 0;
j = 0;
k = l;
while (i < sl && j < sr)
{
const uint lVal = tmpBuffer[ol+i];
const uint rVal = tmpBuffer[or+j];
bool pickLeft = lVal <= rVal;
i = pickLeft ? (i+1) : i;
j = pickLeft ? j : (j+1);
prunedList[k] = pickLeft ? lVal : rVal;
k++;
}
while (i < sl)
{
prunedList[k] = tmpBuffer[ol+i];
i++; k++;
}
while (j < sr)
{
prunedList[k] = tmpBuffer[or+j];
j++; k++;
}
}
#else
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
#if 0
unsigned int value = 1;
while(value<value_in && (value<<1)<=maxValue)
value<<=1;
return value_in==0 ? 0 : value;
#else
uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in)); // firstbithigh(0) returns -1
return max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0)); // max() just in case of overflow
#endif
}
void sortLightList(int localThreadID, int length)
{
// closest pow2 integer greater than or equal to length
const int N = (const int) LimitPow2AndClamp((unsigned int) length, MAX_NR_COARSE_ENTRIES); // N is 1 when length is zero but will still not enter first for-loop
// bitonic sort can only handle arrays with a power of two length. Fill remaining entries with greater than possible index.
for(int t=length+localThreadID; t<N; t+=NR_THREADS) { prunedList[t]=0xffffffff; } // impossible index
GroupMemoryBarrierWithGroupSync();
for(int k=2; k<=N; k=2*k)
{
for(int j=k>>1; j>0; j=j>>1)
{
for(int i=localThreadID; i<N; i+=NR_THREADS)
{
int ixj=i^j;
if((ixj)>i)
{
const unsigned int Avalue = prunedList[i];
const unsigned int Bvalue = prunedList[ixj];
const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
if(mustSwap)
{
prunedList[i]=Bvalue;
prunedList[ixj]=Avalue;
}
}
}
GroupMemoryBarrierWithGroupSync();
}
}
}
#endif
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS

117
Assets/ScriptableRenderLoop/fptl/SortingComputeUtils.hlsl


#ifndef __SORTINGCOMPUTEUTILS_H__
#define __SORTINGCOMPUTEUTILS_H__
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
#if 0
unsigned int value = 1;
while(value<value_in && (value<<1)<=maxValue)
value<<=1;
return value_in==0 ? 0 : value;
#else
uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in)); // firstbithigh(0) returns -1
valpw2 = max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0)); // max() just in case of overflow
return min(valpw2, maxValue);
#endif
}
// have to make this sort routine a macro unfortunately because hlsl doesn't take
// groupshared memory of unspecified length as an input parameter to a function.
// maxcapacity_in must be a power of two.
// all data from length_in and up to closest power of two will be filled with 0xffffffff
#define SORTLIST(data, length_in, maxcapacity_in, localThreadID_in, nrthreads_in) \
{ \
int length=(int) length_in, maxcapacity=(int) maxcapacity_in, localThreadID=(int) localThreadID_in, nrthreads=(int) nrthreads_in; \
\
const int N = (const int) LimitPow2AndClamp((unsigned int) length, (uint) maxcapacity); \
for(int t=length+localThreadID; t<N; t+=nrthreads) { data[t]=0xffffffff; } \
GroupMemoryBarrierWithGroupSync(); \
\
for(int k=2; k<=N; k=2*k) \
{ \
for(int j=k>>1; j>0; j=j>>1) \
{ \
for(int i=localThreadID; i<N; i+=nrthreads) \
{ \
int ixj=i^j; \
if((ixj)>i) \
{ \
const unsigned int Avalue = data[i]; \
const unsigned int Bvalue = data[ixj]; \
\
const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue; \
if(mustSwap) \
{ \
data[i]=Bvalue; \
data[ixj]=Avalue; \
} \
} \
} \
\
GroupMemoryBarrierWithGroupSync(); \
} \
} \
}
// have to make this sort routine a macro unfortunately because hlsl doesn't take
// groupshared memory of unspecified length as an input parameter to a function.
// merge-sort is not in-place so two buffers are required: data and tmpdata.
// These must both have a capacity of at least length_in entries and initial
// input is assumed to be in data and results will be delivered in data.
#define MERGESORTLIST(data, tmpdata, length_in, localThreadID_in, nrthreads_in) \
{ \
int length=(int) length_in, localThreadID=(int) localThreadID_in, nrthreads=(int) nrthreads_in; \
\
for(int curr_size=1; curr_size<=length-1; curr_size = 2*curr_size) \
{ \
for(int left_start=localThreadID*(2*curr_size); left_start<(length-1); left_start+=nrthreads*(2*curr_size)) \
{ \
int mid = left_start + curr_size - 1; \
int right_end = min(left_start + 2*curr_size - 1, length-1); \
{ \
int l=left_start, m=mid, r=right_end; \
\
int i, j, k; \
\
int ol = l; \
int or = m+1; \
int sl = m - l + 1; \
int sr = r - m; \
\
for(int i=l; i<=r; i++) tmpdata[i] = data[i]; \
\
i = 0; j = 0; k = l; \
while (i < sl && j < sr) \
{ \
const uint lVal = tmpdata[ol+i]; \
const uint rVal = tmpdata[or+j]; \
bool pickLeft = lVal <= rVal; \
i = pickLeft ? (i+1) : i; \
j = pickLeft ? j : (j+1); \
data[k] = pickLeft ? lVal : rVal; \
k++; \
} \
\
while (i < sl) \
{ \
data[k] = tmpdata[ol+i]; \
i++; k++; \
} \
\
while (j < sr) \
{ \
data[k] = tmpdata[or+j]; \
j++; k++; \
} \
} \
} \
\
GroupMemoryBarrierWithGroupSync(); \
} \
}
#endif
正在加载...
取消
保存