|
|
|
|
|
|
// 1. 1D texture with marginal densities, telling us the likelihood of selecting a particular row.
|
|
|
// 2. 2D texture with conditional densities, which correspond to the PDF of the texel given its row. |
|
|
|
// Ref: PBRT v3, 13.6.7 "Piecewise-Constant 2D Distributions". |
|
|
|
// Note that we use the equiareal sphere-to-square mapping instead of the latitude-longitude one.
|
|
|
|
|
|
|
#include "Common.hlsl" |
|
|
|
#include "ImageBasedLighting.hlsl" |
|
|
|
|
|
|
#define TEXTURE_HEIGHT 256 // MIS equiareal texture map: cos(theta) = 1.0 - 2.0 * v |
|
|
|
#define TEXTURE_WIDTH 2 * TEXTURE_HEIGHT // MIS equiareal texture map: phi = TWO_PI * u |
|
|
|
#define TEXTURE_WIDTH 2 * TEXTURE_HEIGHT // MIS equiareal texture map: phi = TWO_PI * (1.0 - u) |
|
|
|
|
|
|
|
TEXTURECUBE(envMap) // Input cubemap |
|
|
|
SAMPLERCUBE(sampler_envMap) |
|
|
|
|
|
|
#define NUM_BANKS 32 |
|
|
|
#define SHARED_MEM(x) ((x) + (x) / NUM_BANKS) |
|
|
|
|
|
|
|
// Performs a block-level parallel scan. |
|
|
|
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA". |
|
|
|
#define PARALLEL_SCAN(i, n, temp, sum) \ |
|
|
|
{ \ |
|
|
|
uint offset; \ |
|
|
|
\ |
|
|
|
/* Execute the up-sweep phase. */ \ |
|
|
|
for (offset = 1; offset <= n / 2; offset *= 2) \ |
|
|
|
{ \ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/*** a1 = (2 * i + 1) * offset - 1 */ \ |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); \ |
|
|
|
uint a2 = a1 + offset; \ |
|
|
|
\ |
|
|
|
if (a2 < n) \ |
|
|
|
{ \ |
|
|
|
temp[SHARED_MEM(a2)] += temp[SHARED_MEM(a1)]; \ |
|
|
|
} \ |
|
|
|
} \ |
|
|
|
\ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/* Prevent NaNs arising from the division of 0 by 0. */ \ |
|
|
|
sum = max(temp[SHARED_MEM(n - 1)], FLT_MIN); \ |
|
|
|
\ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/* The exclusive scan requires the last element to be 0. */ \ |
|
|
|
if (i == 0) \ |
|
|
|
{ \ |
|
|
|
temp[SHARED_MEM(n - 1)] = 0.0; \ |
|
|
|
} \ |
|
|
|
\ |
|
|
|
/* Execute the down-sweep phase. */ \ |
|
|
|
for (offset = n / 2; offset > 0; offset /= 2) \ |
|
|
|
{ \ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/*** a1 = (2 * i + 1) * offset - 1 */ \ |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); \ |
|
|
|
uint a2 = a1 + offset; \ |
|
|
|
\ |
|
|
|
if (a2 < n) \ |
|
|
|
{ \ |
|
|
|
float t1 = temp[SHARED_MEM(a1)]; \ |
|
|
|
temp[SHARED_MEM(a1)] = temp[SHARED_MEM(a2)]; \ |
|
|
|
temp[SHARED_MEM(a2)] += t1; \ |
|
|
|
} \ |
|
|
|
} \ |
|
|
|
\ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
} |
|
|
|
|
|
|
|
#pragma kernel ComputeConditionalDensities |
|
|
|
|
|
|
|
groupshared float rowVals[SHARED_MEM(TEXTURE_WIDTH)]; |
|
|
|
|
|
|
uint3 groupThreadId : SV_GroupThreadID) |
|
|
|
{ |
|
|
|
// There are TEXTURE_HEIGHT thread groups. |
|
|
|
// A single thread group processes a row of TEXTURE_WIDTH texels (2 per thread). |
|
|
|
// There are TEXTURE_HEIGHT thread groups processing 2 texels per thread. |
|
|
|
const uint n = TEXTURE_WIDTH; |
|
|
|
const uint i = groupThreadId.x; |
|
|
|
const uint j = groupId.x; |
|
|
|
|
|
|
float3 c1 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L1, 0).rgb; |
|
|
|
float3 c2 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L2, 0).rgb; |
|
|
|
|
|
|
|
// -------------------------------------------------------------------- |
|
|
|
// Perform a block-level parallel scan. |
|
|
|
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA". |
|
|
|
// -------------------------------------------------------------------- |
|
|
|
|
|
|
|
// Step 1: load the row of data into shared memory. |
|
|
|
uint offset; |
|
|
|
float rowValSum; |
|
|
|
// Step 2: execute the up-sweep phase. |
|
|
|
for (offset = 1; offset <= n / 2; offset *= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
PARALLEL_SCAN(i, n, rowVals, rowValSum) |
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
rowVals[SHARED_MEM(a2)] += rowVals[SHARED_MEM(a1)]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
// Prevent NaNs arising from the division of 0 by 0. |
|
|
|
float rowValSum = max(rowVals[SHARED_MEM(n - 1)], FLT_MIN); |
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
conditionalDensities[uint2(i1, j)] = rowVals[SHARED_MEM(i1)] / rowValSum; |
|
|
|
conditionalDensities[uint2(i2, j)] = rowVals[SHARED_MEM(i2)] / rowValSum; |
|
|
|
// The exclusive scan requires the 1st element to be 0. |
|
|
|
rowVals[SHARED_MEM(n - 1)] = 0.0; |
|
|
|
|
|
|
|
// Step 3: execute the down-sweep phase. |
|
|
|
for (offset = n / 2; offset > 0; offset /= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
float t1 = rowVals[SHARED_MEM(a1)]; |
|
|
|
rowVals[SHARED_MEM(a1)] = rowVals[SHARED_MEM(a2)]; |
|
|
|
rowVals[SHARED_MEM(a2)] += t1; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
conditionalDensities[uint2(i1, j)] = rowVals[SHARED_MEM(i1)] / rowValSum; |
|
|
|
conditionalDensities[uint2(i2, j)] = rowVals[SHARED_MEM(i2)] / rowValSum; |
|
|
|
} |
|
|
|
|
|
|
|
#pragma kernel ComputeMarginalRowDensities |
|
|
|
|
|
|
[numthreads(TEXTURE_HEIGHT / 2, 1, 1)] |
|
|
|
void ComputeMarginalRowDensities(uint3 groupThreadId : SV_GroupThreadID) |
|
|
|
{ |
|
|
|
// The size of the input is TEXTURE_HEIGHT. There is only one thread group. |
|
|
|
// There is only one thread group processing 2 texels per thread. |
|
|
|
// -------------------------------------------------------------------- |
|
|
|
// Perform a block-level parallel scan. |
|
|
|
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA". |
|
|
|
// -------------------------------------------------------------------- |
|
|
|
|
|
|
|
// Step 1: load the row of data into shared memory. |
|
|
|
uint offset; |
|
|
|
|
|
|
|
// Step 2: execute the up-sweep phase. |
|
|
|
for (offset = 1; offset <= n / 2; offset *= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
rowInts[SHARED_MEM(a2)] += rowInts[SHARED_MEM(a1)]; |
|
|
|
} |
|
|
|
} |
|
|
|
float rowIntSum; |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
PARALLEL_SCAN(i, n, rowInts, rowIntSum) |
|
|
|
// Prevent NaNs arising from the division of 0 by 0. |
|
|
|
float rowIntSum = max(rowInts[SHARED_MEM(n - 1)], FLT_MIN); |
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum; |
|
|
|
marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum; |
|
|
|
// The exclusive scan requires the 1st element to be 0. |
|
|
|
rowInts[SHARED_MEM(n - 1)] = 0.0; |
|
|
|
|
|
|
|
// Step 3: execute the down-sweep phase. |
|
|
|
for (offset = n / 2; offset > 0; offset /= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
float t1 = rowInts[SHARED_MEM(a1)]; |
|
|
|
rowInts[SHARED_MEM(a1)] = rowInts[SHARED_MEM(a2)]; |
|
|
|
rowInts[SHARED_MEM(a2)] += t1; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum; |
|
|
|
marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum; |
|
|
|
} |