|
|
|
|
|
|
// 1. 1D texture with marginal densities, telling us the likelihood of selecting a particular row.
|
|
|
// 2. 2D texture with conditional densities, which correspond to the PDF of the texel given its row. |
|
|
|
// Ref: PBRT v3, 13.6.7 "Piecewise-Constant 2D Distributions". |
|
|
|
// Note that we use the equiareal sphere-to-square mapping instead of the latitude-longitude one.
|
|
|
|
|
|
|
#include "Common.hlsl" |
|
|
|
#include "ImageBasedLighting.hlsl" |
|
|
|
|
|
|
#define TEXTURE_HEIGHT 256 // MIS equiareal texture map: cos(theta) = 1.0 - 2.0 * v |
|
|
|
#define TEXTURE_WIDTH 2 * TEXTURE_HEIGHT // MIS equiareal texture map: phi = TWO_PI * u |
|
|
|
#define TEXTURE_WIDTH 2 * TEXTURE_HEIGHT // MIS equiareal texture map: phi = TWO_PI * (1.0 - u) |
|
|
|
|
|
|
|
TEXTURECUBE(envMap) // Input cubemap |
|
|
|
SAMPLERCUBE(sampler_envMap) |
|
|
|
|
|
|
#define NUM_BANKS 32 |
|
|
|
#define SHARED_MEM(x) ((x) + (x) / NUM_BANKS) |
|
|
|
|
|
|
|
// Performs a block-level parallel scan. |
|
|
|
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA". |
|
|
|
#define PARALLEL_SCAN(i, n, temp, sum) \ |
|
|
|
{ \ |
|
|
|
uint offset; \ |
|
|
|
\ |
|
|
|
/* Execute the up-sweep phase. */ \ |
|
|
|
for (offset = 1; offset <= n / 2; offset *= 2) \ |
|
|
|
{ \ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/*** a1 = (2 * i + 1) * offset - 1 */ \ |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); \ |
|
|
|
uint a2 = a1 + offset; \ |
|
|
|
\ |
|
|
|
if (a2 < n) \ |
|
|
|
{ \ |
|
|
|
temp[SHARED_MEM(a2)] += temp[SHARED_MEM(a1)]; \ |
|
|
|
} \ |
|
|
|
} \ |
|
|
|
\ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/* Prevent NaNs arising from the division of 0 by 0. */ \ |
|
|
|
sum = max(temp[SHARED_MEM(n - 1)], FLT_MIN); \ |
|
|
|
\ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/* The exclusive scan requires the last element to be 0. */ \ |
|
|
|
if (i == 0) \ |
|
|
|
{ \ |
|
|
|
temp[SHARED_MEM(n - 1)] = 0.0; \ |
|
|
|
} \ |
|
|
|
\ |
|
|
|
/* Execute the down-sweep phase. */ \ |
|
|
|
for (offset = n / 2; offset > 0; offset /= 2) \ |
|
|
|
{ \ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
\ |
|
|
|
/*** a1 = (2 * i + 1) * offset - 1 */ \ |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); \ |
|
|
|
uint a2 = a1 + offset; \ |
|
|
|
\ |
|
|
|
if (a2 < n) \ |
|
|
|
{ \ |
|
|
|
float t1 = temp[SHARED_MEM(a1)]; \ |
|
|
|
temp[SHARED_MEM(a1)] = temp[SHARED_MEM(a2)]; \ |
|
|
|
temp[SHARED_MEM(a2)] += t1; \ |
|
|
|
} \ |
|
|
|
} \ |
|
|
|
\ |
|
|
|
GroupMemoryBarrierWithGroupSync(); \ |
|
|
|
} |
|
|
|
|
|
|
|
#pragma kernel ComputeConditionalDensities |
|
|
|
|
|
|
|
groupshared float rowVals[SHARED_MEM(TEXTURE_WIDTH)]; |
|
|
|
|
|
|
uint3 groupThreadId : SV_GroupThreadID) |
|
|
|
{ |
|
|
|
// There are TEXTURE_HEIGHT thread groups. |
|
|
|
// A single thread group processes a row of TEXTURE_WIDTH texels (2 per thread). |
|
|
|
// There are TEXTURE_HEIGHT thread groups processing 2 texels per thread. |
|
|
|
const uint n = TEXTURE_WIDTH; |
|
|
|
const uint i = groupThreadId.x; |
|
|
|
const uint j = groupId.x; |
|
|
|
|
|
|
float3 c1 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L1, 0).rgb; |
|
|
|
float3 c2 = SAMPLE_TEXTURECUBE_LOD(envMap, sampler_envMap, L2, 0).rgb; |
|
|
|
|
|
|
|
// -------------------------------------------------------------------- |
|
|
|
// Perform a block-level parallel scan. |
|
|
|
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA". |
|
|
|
// -------------------------------------------------------------------- |
|
|
|
|
|
|
|
// Step 1: load the row of data into shared memory. |
|
|
|
uint offset; |
|
|
|
float rowValSum; |
|
|
|
// Step 2: execute the up-sweep phase. |
|
|
|
for (offset = 1; offset <= n / 2; offset *= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
PARALLEL_SCAN(i, n, rowVals, rowValSum) |
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
rowVals[SHARED_MEM(a2)] += rowVals[SHARED_MEM(a1)]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
// Prevent NaNs arising from the division of 0 by 0. |
|
|
|
float rowValSum = max(rowVals[SHARED_MEM(n - 1)], FLT_MIN); |
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
conditionalDensities[uint2(i1, j)] = rowVals[SHARED_MEM(i1)] / rowValSum; |
|
|
|
conditionalDensities[uint2(i2, j)] = rowVals[SHARED_MEM(i2)] / rowValSum; |
|
|
|
// The exclusive scan requires the 1st element to be 0. |
|
|
|
rowVals[SHARED_MEM(n - 1)] = 0.0; |
|
|
|
|
|
|
|
// Step 3: execute the down-sweep phase. |
|
|
|
for (offset = n / 2; offset > 0; offset /= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
float t1 = rowVals[SHARED_MEM(a1)]; |
|
|
|
rowVals[SHARED_MEM(a1)] = rowVals[SHARED_MEM(a2)]; |
|
|
|
rowVals[SHARED_MEM(a2)] += t1; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
conditionalDensities[uint2(i1, j)] = rowVals[SHARED_MEM(i1)] / rowValSum; |
|
|
|
conditionalDensities[uint2(i2, j)] = rowVals[SHARED_MEM(i2)] / rowValSum; |
|
|
|
} |
|
|
|
|
|
|
|
#pragma kernel ComputeMarginalRowDensities |
|
|
|
|
|
|
[numthreads(TEXTURE_HEIGHT / 2, 1, 1)] |
|
|
|
void ComputeMarginalRowDensities(uint3 groupThreadId : SV_GroupThreadID) |
|
|
|
{ |
|
|
|
// The size of the input is TEXTURE_HEIGHT. There is only one thread group. |
|
|
|
// There is only one thread group processing 2 texels per thread. |
|
|
|
// -------------------------------------------------------------------- |
|
|
|
// Perform a block-level parallel scan. |
|
|
|
// Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA". |
|
|
|
// -------------------------------------------------------------------- |
|
|
|
|
|
|
|
// Step 1: load the row of data into shared memory. |
|
|
|
uint offset; |
|
|
|
|
|
|
|
// Step 2: execute the up-sweep phase. |
|
|
|
for (offset = 1; offset <= n / 2; offset *= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
rowInts[SHARED_MEM(a2)] += rowInts[SHARED_MEM(a1)]; |
|
|
|
} |
|
|
|
} |
|
|
|
float rowIntSum; |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
PARALLEL_SCAN(i, n, rowInts, rowIntSum) |
|
|
|
// Prevent NaNs arising from the division of 0 by 0. |
|
|
|
float rowIntSum = max(rowInts[SHARED_MEM(n - 1)], FLT_MIN); |
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum; |
|
|
|
marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum; |
|
|
|
// The exclusive scan requires the 1st element to be 0. |
|
|
|
rowInts[SHARED_MEM(n - 1)] = 0.0; |
|
|
|
|
|
|
|
// Step 3: execute the down-sweep phase. |
|
|
|
for (offset = n / 2; offset > 0; offset /= 2) |
|
|
|
{ |
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
/// a1 = (2 * i + 1) * offset - 1; |
|
|
|
uint a1 = Mad24(Mad24(2, i, 1), offset, -1); |
|
|
|
uint a2 = a1 + offset; |
|
|
|
|
|
|
|
if (a2 < n) |
|
|
|
{ |
|
|
|
float t1 = rowInts[SHARED_MEM(a1)]; |
|
|
|
rowInts[SHARED_MEM(a1)] = rowInts[SHARED_MEM(a2)]; |
|
|
|
rowInts[SHARED_MEM(a2)] += t1; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
GroupMemoryBarrierWithGroupSync(); |
|
|
|
|
|
|
|
// Compute the CDF. Note: the value at (i = n) is implicitly 1. |
|
|
|
marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum; |
|
|
|
marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum; |
|
|
|
} |