// Given a cube map (passed as a 2D array), builds CDFs of two distributions:
// 1. 1D texture with marginal densities, telling us the likelihood of selecting a particular row,
// 2. 2D texture with conditional densities, which correspond to the PDF of the texel given its row.
// Ref: PBRT v3, 13.6.7 "Piecewise-Constant 2D Distributions".
// Note that we use the equiareal sphere-to-square mapping instead of the latitude-longitude one.
//
// The two kernels below are dependent: ComputeMarginalRowDensities consumes the per-row
// integrals that ComputeConditionalDensities stores in 'marginalRowDensities', so it has to be
// dispatched after ComputeConditionalDensities has completed.

#include "../../Core/ShaderLibrary/Common.hlsl"
#include "../../Core/ShaderLibrary/ImageBasedLighting.hlsl"

/* --- Input --- */

// The width is parenthesized so that the macro expands safely inside any expression
// (e.g. 'x / TEXTURE_WIDTH' or 'x % TEXTURE_WIDTH').
#define TEXTURE_HEIGHT 256                  // Equiareal texture map: cos(theta) = 1.0 - 2.0 * v
#define TEXTURE_WIDTH  (2 * TEXTURE_HEIGHT) // Equiareal texture map: phi = TWO_PI * (1.0 - u)

TEXTURECUBE(envMap);                        // Input cubemap
SAMPLERCUBE(sampler_envMap);

/* --- Output --- */

// Both outputs are only ever read and written as scalar floats by the kernels below.
RWTexture2D<float> marginalRowDensities;    // [(TEXTURE_HEIGHT + 1) x 1] (+ 1 for the image integral)
RWTexture2D<float> conditionalDensities;    // [TEXTURE_WIDTH x TEXTURE_HEIGHT]

/* --- Implementation --- */

// Creates an access pattern which avoids shared memory bank conflicts.
// One padding element is inserted after every NUM_BANKS elements, so an array
// of 'x' logical elements must be declared with SHARED_MEM(x) physical elements.
#define NUM_BANKS     32
#define SHARED_MEM(x) ((x) + (x) / NUM_BANKS)

// Performs a block-level parallel scan.
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA".
//
// Parameters:
//   i    - index of the calling thread within the group, in [0, n / 2).
//   n    - number of elements to scan. The sweep loops assume a power of two
//          (both values used in this file, 256 and 512, are).
//   temp - groupshared float array addressed through SHARED_MEM(); each thread must have
//          written its input elements before invoking the macro (the first barrier inside
//          the up-sweep loop makes those writes visible to the group).
//   sum  - (out) float variable receiving the total of all 'n' inputs, clamped from below
//          to FLT_MIN so that callers can divide by it.
//
// On exit, temp[SHARED_MEM(k)] holds the exclusive prefix sum, i.e. the sum of the inputs
// [0, k), and a trailing barrier guarantees that every thread can read any element.
// The macro must be invoked by all threads of the group in uniform control flow,
// since it executes group-wide barriers.
#define PARALLEL_SCAN(i, n, temp, sum)                                      \
{                                                                           \
    uint offset;                                                            \
                                                                            \
    /* Execute the up-sweep phase. */                                       \
    for (offset = 1; offset <= (n) / 2; offset *= 2)                        \
    {                                                                       \
        GroupMemoryBarrierWithGroupSync();                                  \
                                                                            \
        /* a1 = (2 * i + 1) * offset - 1 */                                 \
        uint a1 = Mad24(Mad24(2, (i), 1), offset, -1);                      \
        uint a2 = a1 + offset;                                              \
                                                                            \
        if (a2 < (n))                                                       \
        {                                                                   \
            temp[SHARED_MEM(a2)] += temp[SHARED_MEM(a1)];                   \
        }                                                                   \
    }                                                                       \
                                                                            \
    GroupMemoryBarrierWithGroupSync();                                      \
                                                                            \
    /* Prevent NaNs arising from the division of 0 by 0. */                 \
    sum = max(temp[SHARED_MEM((n) - 1)], FLT_MIN);                          \
                                                                            \
    /* Every thread must have read the total before it is overwritten. */   \
    GroupMemoryBarrierWithGroupSync();                                      \
                                                                            \
    /* The exclusive scan requires the last element to be 0. */             \
    if ((i) == 0)                                                           \
    {                                                                       \
        temp[SHARED_MEM((n) - 1)] = 0.0;                                    \
    }                                                                       \
                                                                            \
    /* Execute the down-sweep phase. */                                     \
    for (offset = (n) / 2; offset > 0; offset /= 2)                         \
    {                                                                       \
        GroupMemoryBarrierWithGroupSync();                                  \
                                                                            \
        /* a1 = (2 * i + 1) * offset - 1 */                                 \
        uint a1 = Mad24(Mad24(2, (i), 1), offset, -1);                      \
        uint a2 = a1 + offset;                                              \
                                                                            \
        if (a2 < (n))                                                       \
        {                                                                   \
            float t1 = temp[SHARED_MEM(a1)];                                \
            temp[SHARED_MEM(a1)]  = temp[SHARED_MEM(a2)];                   \
            temp[SHARED_MEM(a2)] += t1;                                     \
        }                                                                   \
    }                                                                       \
                                                                            \
    GroupMemoryBarrierWithGroupSync();                                      \
}

#pragma kernel ComputeConditionalDensities

groupshared float rowVals[SHARED_MEM(TEXTURE_WIDTH)];

// Builds the conditional CDF of one row of the equiareal map per thread group.
// For row 'j' (= groupId.x) it writes:
//   conditionalDensities[(i, j)] - normalized exclusive prefix sum of the row's texel values,
//   marginalRowDensities[(j, 0)] - the row integral (row sum divided by the row width),
//                                  later turned into a CDF by ComputeMarginalRowDensities.
[numthreads(TEXTURE_WIDTH / 2, 1, 1)]
void ComputeConditionalDensities(uint3 groupId       : SV_GroupID,
                                 uint3 groupThreadId : SV_GroupThreadID)
{
    // There are TEXTURE_HEIGHT thread groups processing 2 texels per thread.
    const uint n = TEXTURE_WIDTH;
    const uint i = groupThreadId.x;
    const uint j = groupId.x;

    // Each thread handles one texel in the left half of the row and one in the right half.
    const uint i1 = i;
    const uint i2 = i + n / 2;

    float w = TEXTURE_WIDTH;
    float h = TEXTURE_HEIGHT;

    // Texel centers in normalized [0, 1) coordinates.
    float u1 = i1 / w + 0.5 / w;
    float u2 = i2 / w + 0.5 / w;
    float v  = j  / h + 0.5 / h;

    float3 L1 = ConvertEquiarealToCubemap(u1, v);
    float3 L2 = ConvertEquiarealToCubemap(u2, v);

    float3 c1 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L1, 0).rgb;
    float3 c2 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L2, 0).rgb;

    // Compute the integral of the step function (row values).
    // The value of a texel is the plain sum of its color channels.
    // TODO: process 4 texels per thread, and manually unroll.
    rowVals[SHARED_MEM(i1)] = c1.r + c1.g + c1.b;
    rowVals[SHARED_MEM(i2)] = c2.r + c2.g + c2.b;

    float rowValSum;

    PARALLEL_SCAN(i, n, rowVals, rowValSum)

    // Compute the CDF. Note: the value at (i = n) is implicitly 1.
    conditionalDensities[uint2(i1, j)] = rowVals[SHARED_MEM(i1)] / rowValSum;
    conditionalDensities[uint2(i2, j)] = rowVals[SHARED_MEM(i2)] / rowValSum;

    if (i == 0)
    {
        float rowIntegralValue = rowValSum / n;
        marginalRowDensities[uint2(j, 0)] = rowIntegralValue;
    }
}

#pragma kernel ComputeMarginalRowDensities

groupshared float rowInts[SHARED_MEM(TEXTURE_HEIGHT)];

// Converts the row integrals stored in marginalRowDensities[0 .. TEXTURE_HEIGHT - 1]
// into a CDF (in place), and stores the image integral (sum of the row integrals
// divided by the number of rows) in marginalRowDensities[(TEXTURE_HEIGHT, 0)].
// Expects the row integrals to have been written by ComputeConditionalDensities beforehand.
[numthreads(TEXTURE_HEIGHT / 2, 1, 1)]
void ComputeMarginalRowDensities(uint3 groupThreadId : SV_GroupThreadID)
{
    // There is only one thread group processing 2 texels per thread.
    const uint n = TEXTURE_HEIGHT;
    const uint i = groupThreadId.x;

    // Each thread reads and later overwrites the same two texels, so the in-place
    // update does not race with other threads.
    const uint i1 = i;
    const uint i2 = i + n / 2;

    // Compute the integral of the step function (row integrals).
    // TODO: process 4 texels per thread, and manually unroll.
    rowInts[SHARED_MEM(i1)] = marginalRowDensities[uint2(i1, 0)];
    rowInts[SHARED_MEM(i2)] = marginalRowDensities[uint2(i2, 0)];

    float rowIntSum;

    PARALLEL_SCAN(i, n, rowInts, rowIntSum)

    // Compute the CDF. Note: the value at (i = n) is implicitly 1.
    marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum;
    marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum;

    if (i == 0)
    {
        float imgIntegralValue = rowIntSum / n;
        marginalRowDensities[uint2(n, 0)] = imgIntegralValue;
    }
}