
Factor out PARALLEL_SCAN()

main
Evgenii Golubev, 8 years ago
Current commit: 33a76e5a
2 files changed, 92 insertions and 112 deletions
  1. Assets/ScriptableRenderLoop/HDRenderLoop/Sky/Resources/BuildProbabilityTables.compute (173 lines changed)
  2. Assets/ScriptableRenderLoop/ShaderLibrary/ImageBasedLighting.hlsl (31 lines changed)

Assets/ScriptableRenderLoop/HDRenderLoop/Sky/Resources/BuildProbabilityTables.compute (173 lines changed)


// 1. 1D texture with marginal densities, telling us the likelihood of selecting a particular row,
// 2. 2D texture with conditional densities, which correspond to the PDF of the texel given its row.
// Ref: PBRT v3, 13.6.7 "Piecewise-Constant 2D Distributions".
- // Note that we use the equiareal mapping instead of the latitude-longitude one.
+ // Note that we use the equiareal sphere-to-square mapping instead of the latitude-longitude one.
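Put differently, the two tables factor the texel-selection PDF as p(u, v) = p_marginal(v) * p_conditional(u | v): the 1D texture answers "which row?", and the selected row of the 2D texture answers "which texel within that row?". Section 13.6.7 of PBRT v3 derives this factorization and the matching inverse-CDF sampling procedure.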
#include "Common.hlsl"
#include "ImageBasedLighting.hlsl"

#define TEXTURE_HEIGHT 256 // MIS equiareal texture map: cos(theta) = 1.0 - 2.0 * v
- #define TEXTURE_WIDTH 2 * TEXTURE_HEIGHT // MIS equiareal texture map: phi = TWO_PI * u
+ #define TEXTURE_WIDTH 2 * TEXTURE_HEIGHT // MIS equiareal texture map: phi = TWO_PI * (1.0 - u)
TEXTURECUBE(envMap) // Input cubemap
SAMPLERCUBE(sampler_envMap)

#define NUM_BANKS 32
#define SHARED_MEM(x) ((x) + (x) / NUM_BANKS) // Pad the index (1 extra slot per NUM_BANKS elements) to avoid shared memory bank conflicts.
// Performs a block-level parallel scan.
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA".
#define PARALLEL_SCAN(i, n, temp, sum) \
{ \
uint offset; \
\
/* Execute the up-sweep phase. */ \
for (offset = 1; offset <= n / 2; offset *= 2) \
{ \
GroupMemoryBarrierWithGroupSync(); \
\
/* a1 = (2 * i + 1) * offset - 1 */ \
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); \
uint a2 = a1 + offset; \
\
if (a2 < n) \
{ \
temp[SHARED_MEM(a2)] += temp[SHARED_MEM(a1)]; \
} \
} \
\
GroupMemoryBarrierWithGroupSync(); \
\
/* Prevent NaNs arising from the division of 0 by 0. */ \
sum = max(temp[SHARED_MEM(n - 1)], FLT_MIN); \
\
GroupMemoryBarrierWithGroupSync(); \
\
/* The exclusive scan requires the last element to be 0. */ \
if (i == 0) \
{ \
temp[SHARED_MEM(n - 1)] = 0.0; \
} \
\
/* Execute the down-sweep phase. */ \
for (offset = n / 2; offset > 0; offset /= 2) \
{ \
GroupMemoryBarrierWithGroupSync(); \
\
/* a1 = (2 * i + 1) * offset - 1 */ \
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); \
uint a2 = a1 + offset; \
\
if (a2 < n) \
{ \
float t1 = temp[SHARED_MEM(a1)]; \
temp[SHARED_MEM(a1)] = temp[SHARED_MEM(a2)]; \
temp[SHARED_MEM(a2)] += t1; \
} \
} \
\
GroupMemoryBarrierWithGroupSync(); \
}
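For orientation, the macro is Blelloch's work-efficient exclusive scan from the GPU Gems 3 chapter cited above: the up-sweep builds a tree of partial sums in place, the block total is read into `sum` (clamped to FLT_MIN) and the last slot is cleared, and the down-sweep propagates the partial sums back down so that `temp` ends up holding exclusive prefix sums. Below is a minimal usage sketch; the kernel name, element count, and input values are illustrative and do not appear in this file:

// Hypothetical example: scan n = 64 values with n / 2 = 32 threads (2 elements per thread).
groupshared float exampleVals[SHARED_MEM(64)];

[numthreads(32, 1, 1)]
void ExampleScan(uint3 groupThreadId : SV_GroupThreadID)
{
    const uint n = 64;
    const uint i = groupThreadId.x;

    // Each thread writes two adjacent inputs into the padded shared-memory array.
    exampleVals[SHARED_MEM(2 * i)]     = 1.0;
    exampleVals[SHARED_MEM(2 * i + 1)] = 2.0;

    float sum;

    PARALLEL_SCAN(i, n, exampleVals, sum)

    // exampleVals[SHARED_MEM(k)] now holds the sum of inputs 0..k-1 (element 0 is 0),
    // and 'sum' holds the total of all 64 inputs: 32 * (1.0 + 2.0) = 96.
}

For instance, an input row {3, 1, 7, 0} would scan to {0, 3, 4, 11} with a block total of 11.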
#pragma kernel ComputeConditionalDensities
groupshared float rowVals[SHARED_MEM(TEXTURE_WIDTH)];

[numthreads(TEXTURE_WIDTH / 2, 1, 1)]
void ComputeConditionalDensities(uint3 groupId       : SV_GroupID,
                                 uint3 groupThreadId : SV_GroupThreadID)
{
- // There are TEXTURE_HEIGHT thread groups.
- // A single thread group processes a row of TEXTURE_WIDTH texels (2 per thread).
+ // There are TEXTURE_HEIGHT thread groups processing 2 texels per thread.
const uint n = TEXTURE_WIDTH;
const uint i = groupThreadId.x;
const uint j = groupId.x;

float3 c1 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L1, 0).rgb;
float3 c2 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L2, 0).rgb;
// --------------------------------------------------------------------
// Perform a block-level parallel scan.
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA".
// --------------------------------------------------------------------
// Step 1: load the row of data into shared memory.
- uint offset;
+ float rowValSum;
- // Step 2: execute the up-sweep phase.
- for (offset = 1; offset <= n / 2; offset *= 2)
- {
- GroupMemoryBarrierWithGroupSync();
+ PARALLEL_SCAN(i, n, rowVals, rowValSum)
- // a1 = (2 * i + 1) * offset - 1;
- uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
- uint a2 = a1 + offset;
- if (a2 < n)
- {
- rowVals[SHARED_MEM(a2)] += rowVals[SHARED_MEM(a1)];
- }
- }
- GroupMemoryBarrierWithGroupSync();
- // Prevent NaNs arising from the division of 0 by 0.
- float rowValSum = max(rowVals[SHARED_MEM(n - 1)], FLT_MIN);
+ // Compute the CDF. Note: the value at (i = n) is implicitly 1.
+ conditionalDensities[uint2(i1, j)] = rowVals[SHARED_MEM(i1)] / rowValSum;
+ conditionalDensities[uint2(i2, j)] = rowVals[SHARED_MEM(i2)] / rowValSum;
- // The exclusive scan requires the 1st element to be 0.
- rowVals[SHARED_MEM(n - 1)] = 0.0;
- // Step 3: execute the down-sweep phase.
- for (offset = n / 2; offset > 0; offset /= 2)
- {
- GroupMemoryBarrierWithGroupSync();
- // a1 = (2 * i + 1) * offset - 1;
- uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
- uint a2 = a1 + offset;
- if (a2 < n)
- {
- float t1 = rowVals[SHARED_MEM(a1)];
- rowVals[SHARED_MEM(a1)] = rowVals[SHARED_MEM(a2)];
- rowVals[SHARED_MEM(a2)] += t1;
- }
- }
- GroupMemoryBarrierWithGroupSync();
- // Compute the CDF. Note: the value at (i = n) is implicitly 1.
- conditionalDensities[uint2(i1, j)] = rowVals[SHARED_MEM(i1)] / rowValSum;
- conditionalDensities[uint2(i2, j)] = rowVals[SHARED_MEM(i2)] / rowValSum;
}
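As a concrete check of the normalization above (numbers invented for illustration, not taken from the shader): a row of four luminance values {1, 3, 2, 2} has the exclusive scan {0, 1, 4, 6} and the row sum 8, so the stored CDF is {0, 0.125, 0.5, 0.75}; the value at i = n would be 8 / 8 = 1, which is why it never needs to be stored.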
#pragma kernel ComputeMarginalRowDensities

[numthreads(TEXTURE_HEIGHT / 2, 1, 1)]
void ComputeMarginalRowDensities(uint3 groupThreadId : SV_GroupThreadID)
{
- // The size of the input is TEXTURE_HEIGHT. There is only one thread group.
+ // There is only one thread group processing 2 texels per thread.
// --------------------------------------------------------------------
// Perform a block-level parallel scan.
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA".
// --------------------------------------------------------------------
// Step 1: load the row of data into shared memory.
- uint offset;
- // Step 2: execute the up-sweep phase.
- for (offset = 1; offset <= n / 2; offset *= 2)
- {
- GroupMemoryBarrierWithGroupSync();
- // a1 = (2 * i + 1) * offset - 1;
- uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
- uint a2 = a1 + offset;
- if (a2 < n)
- {
- rowInts[SHARED_MEM(a2)] += rowInts[SHARED_MEM(a1)];
- }
- }
+ float rowIntSum;
- GroupMemoryBarrierWithGroupSync();
+ PARALLEL_SCAN(i, n, rowInts, rowIntSum)
- // Prevent NaNs arising from the division of 0 by 0.
- float rowIntSum = max(rowInts[SHARED_MEM(n - 1)], FLT_MIN);
+ // Compute the CDF. Note: the value at (i = n) is implicitly 1.
+ marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum;
+ marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum;
- // The exclusive scan requires the 1st element to be 0.
- rowInts[SHARED_MEM(n - 1)] = 0.0;
- // Step 3: execute the down-sweep phase.
- for (offset = n / 2; offset > 0; offset /= 2)
- {
- GroupMemoryBarrierWithGroupSync();
- // a1 = (2 * i + 1) * offset - 1;
- uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
- uint a2 = a1 + offset;
- if (a2 < n)
- {
- float t1 = rowInts[SHARED_MEM(a1)];
- rowInts[SHARED_MEM(a1)] = rowInts[SHARED_MEM(a2)];
- rowInts[SHARED_MEM(a2)] += t1;
- }
- }
- GroupMemoryBarrierWithGroupSync();
- // Compute the CDF. Note: the value at (i = n) is implicitly 1.
- marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum;
- marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum;
}
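Downstream, the two tables support the two-step inverse-CDF lookup described in PBRT 13.6.7: one uniform random number picks a row through the marginal CDF, a second picks a texel within that row through the conditional CDF, and the resulting (u, v) is mapped back to a direction with the equiareal mapping. The sketch below is hypothetical: the function, texture, and parameter names are illustrative (EquiarealUVToDirection stands in for the conversion function shown in the ImageBasedLighting.hlsl hunk further down), and the actual sampling code lives elsewhere in the render loop.

// Hypothetical helper: given an exclusive CDF stored along one row of a table, find the
// texel whose CDF interval contains xi. The CDF value at i = size is implicitly 1.
float InvertCdf(Texture2D<float> cdf, uint row, uint size, float xi)
{
    uint i = 0;
    // Linear search for clarity; a binary search would be preferable for large tables.
    while (i + 1 < size && cdf[uint2(i + 1, row)] <= xi) { i++; }
    return (i + 0.5) / size; // Texel center in [0, 1].
}

// Hypothetical sampling routine built on the marginal and conditional tables.
float3 SampleEnvMapDirection(Texture2D<float> marginal, Texture2D<float> conditional,
                             uint width, uint height, float2 xi)
{
    float v = InvertCdf(marginal, 0, height, xi.x);   // Choose a row.
    uint  j = (uint)(v * height);
    float u = InvertCdf(conditional, j, width, xi.y); // Choose a column within that row.
    return EquiarealUVToDirection(u, v);              // Map the equiareal (u, v) to a direction.
}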

Assets/ScriptableRenderLoop/ShaderLibrary/ImageBasedLighting.hlsl (31 lines changed)


return mipmapLevel / UNITY_SPECCUBE_LOD_STEPS;
}
- // Performs conversion from equiareal map coordinates to cubemap (Cartesian) ones.
+ //-----------------------------------------------------------------------------
+ // Coordinate system conversion
+ //-----------------------------------------------------------------------------
+ // Converts Cartesian coordinates given in the right-handed coordinate system
+ // with Z pointing upwards (OpenGL style) to the coordinates in the left-handed
+ // coordinate system with Y pointing up (DirectX style).
+ float3 TransformGLtoDX(float x, float y, float z)
+ {
+ return float3(x, z, y);
+ }
+ float3 TransformGLtoDX(float3 v)
+ {
+ return v.xzy;
+ }
+ // Performs conversion from equiareal map coordinates to Cartesian (DirectX cubemap) ones.
- // x = sin(theta) * sin(phi)
- // y = cos(theta)
- // z = sin(theta) * cos(phi)
+ // x = sin(theta) * cos(phi)
+ // y = sin(theta) * sin(phi)
+ // z = cos(theta)
- // phi = TWO_PI * u
+ // phi = TWO_PI * (1.0 - u)
- sincos(TWO_PI * u, sinPhi, cosPhi);
+ sincos(TWO_PI - TWO_PI * u, sinPhi, cosPhi);
- return float3(sinTheta * sinPhi, cosTheta, sinTheta * cosPhi);
+ return TransformGLtoDX(sinTheta * cosPhi, sinTheta * sinPhi, cosTheta);
}
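A note on why this parameterization suits the probability tables built above: with cos(theta) = 1.0 - 2.0 * v and phi = TWO_PI * (1.0 - u), the solid-angle element sin(theta) * dtheta * dphi = |d(cos(theta))| * |dphi| = (2 * dv) * (TWO_PI * du) = 4 * PI * du * dv is constant over the unit square. Every texel of the table therefore subtends the same solid angle, so the densities can be built from raw texel values without the sin(theta) weighting that the latitude-longitude parameterization would require.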
// Ref: See "Moving Frostbite to PBR" Listing 22
