Build probability tables using only 2D textures (cubemaps as arrays)

8 年前 · c31ad089
--- a/Assets/ScriptableRenderLoop/HDRenderLoop/Sky/Resources/BuildProbabilityTables.compute
+++ b/Assets/ScriptableRenderLoop/HDRenderLoop/Sky/Resources/BuildProbabilityTables.compute
-// TODO: add description
+// Given a cube map (passed as a 2D array), builds CDFs of two distributions:
+// 1. 1D texture with marginal densities, telling us the likelihood of selecting a particular row,
+// 2. 2D texture with conditional densities, which correspond to the PDF of the texel given its row.
+// Ref: PBRT v3, 13.6.7 "Piecewise-Constant 2D Distributions".
-#define textureSize 128                  // The size of the MIP level 1 of the input texture
-
-int cubeFaceId;                          // Cubemap face index
+#define TEXTURE_SIZE 256                 // The size of the input texture
+#define MIP1_SIZE    (TEXTURE_SIZE / 2)  // The size of the MIP level 1 of the input texture (parenthesized so it expands safely inside larger expressions)
-TEXTURE2D(envMap)                        // Cubemap face (s.t. MIP 1: [textureSize x textureSize])
+TEXTURE2D_ARRAY(envMap)                  // Cubemap as an array: [TEXTURE_SIZE x TEXTURE_SIZE x 6]
-RWTexture2D<float> marginalRowDensities; // One row per face: [textureSize x 8]
-RWTexture2D<float> conditionalDensities; // Cubemap face: [textureSize x textureSize]
+RWTexture2D<float> marginalRowDensities; // 1D texture: [(6 * MIP1_SIZE + 1) x 1]
+RWTexture2D<float> conditionalDensities; // Array: [MIP1_SIZE x (6 * MIP1_SIZE)]
-/* --- Shared --- */
+/* --- Implementation --- */
-groupshared float rowIntegralValues[textureSize];
+// Creates an access pattern which avoids shared memory bank conflicts.
+#define NUM_BANKS 32
+#define SHARED_MEM(x) ((x) + (x) / NUM_BANKS)
-/* --- Implementation --- */
+#pragma kernel ComputeConditionalDensities
-#pragma kernel BuildProabilityTables
+groupshared float rowVals[SHARED_MEM(MIP1_SIZE)];
-[numthreads(1, textureSize, 1)]
-void BuildProabilityTables(uint3 groupId          : SV_GroupID,
-                           uint3 groupThreadId    : SV_GroupThreadID,
-                           uint3 dispatchThreadId : SV_DispatchThreadID,
-                           uint  groupIndex       : SV_GroupIndex)
+// Kernel 1: for every row of MIP level 1 of every cubemap face, writes
+// - the average intensity of the row to 'marginalRowDensities' (not yet normalized),
+// - the exclusive prefix sum of the row, divided by the row total, to 'conditionalDensities'.
+[numthreads(MIP1_SIZE / 2, 1, 1)]
+void ComputeConditionalDensities(uint3 groupId       : SV_GroupID,
+                                 uint3 groupThreadId : SV_GroupThreadID)
-    // A single thread group processes a row of 'textureSize' texels.
-    const int j = groupThreadId.y;
+    // There are (MIP1_SIZE x 6) thread groups.
+    // A single thread group processes a row of MIP1_SIZE texels (2 per thread).
+    const uint n  = MIP1_SIZE;
+    const uint i  = groupThreadId.x;
+    const uint j  = groupId.x;          // Row index within the face
+    const uint k  = groupId.y;          // Array slice (cubemap face) index
+    const uint jk = Mad24(k, n, j);     // Row index within the stacked output tables
+    const uint i1 = i;
+    const uint i2 = i + n / 2;
-    // TODO: reduce storage requirements.
-    /* HUGE */ float temp[textureSize];
+    // --------------------------------------------------------------------
+    // Compute the integral of the step function (row values).
+    // Perform a block-level parallel scan.
+    // Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA".
+    // TODO: process 4 texels per thread, and manually unroll.
+    // --------------------------------------------------------------------
-    // Compute the integral of the step function.
-    float rowIntegralValue = 0.0;
+    // Step 1: load the row of data into shared memory.
+    // We use MIP level 1 to account for interpolation during light sampling.
+    // Ref: PBRT v3, page 847.
+    float3 c1 = LOAD_TEXTURE2D_ARRAY_LOD(envMap, uint2(i1, j), k, 1).rgb;
+    float3 c2 = LOAD_TEXTURE2D_ARRAY_LOD(envMap, uint2(i2, j), k, 1).rgb;
+    rowVals[SHARED_MEM(i1)] = c1.r + c1.g + c1.b;
+    rowVals[SHARED_MEM(i2)] = c2.r + c2.g + c2.b;
-    // Suppress the D3D compiler warning.
-    int i;
+    uint offset;
-    // TODO: run in parallel.
-    for (i = 0; i < textureSize; i++)
+    // Step 2: execute the up-sweep phase.
+    for (offset = 1; offset <= n / 2; offset *= 2)
-        temp[i] = rowIntegralValue;
+        GroupMemoryBarrierWithGroupSync();
-        // We use MIP level 1 to account for interpolation during light sampling.
-        // Ref: PBRT v3, page 847.
-        float3 color     = LOAD_TEXTURE2D_LOD(envMap, int2(i, j), 1).rgb;
-        float  intensity = color.r + color.g + color.b;
+        /// a1 = (2 * i + 1) * offset - 1;
+        uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
+        uint a2 = a1 + offset;
-        rowIntegralValue += intensity / textureSize;
+        if (a2 < n)
+        {
+            rowVals[SHARED_MEM(a2)] += rowVals[SHARED_MEM(a1)];
+        }
+
+    GroupMemoryBarrierWithGroupSync();
-    rowIntegralValue = max(rowIntegralValue, FLT_MIN);
+    // The last element now holds the sum of the entire row.
+    // Prevent NaNs arising from the division of 0 by 0.
+    float rowValSum = max(rowVals[SHARED_MEM(n - 1)], FLT_MIN);
+
+    // Every thread must have read the row sum before thread 0 overwrites it below.
+    GroupMemoryBarrierWithGroupSync();
+
-    // Compute the CDF. Note: the value at (i = textureSize) is implicitly 1.
-    // TODO: run in parallel.
-    for (i = 0; i < textureSize; i++)
+    if (i == 0)
-        conditionalDensities[int2(i, j)] = temp[i] / rowIntegralValue;
+        float rowIntegralValue = rowValSum / n;
+        marginalRowDensities[uint2(jk, 0)] = rowIntegralValue;
+        // The exclusive scan clears the last element before the down-sweep;
+        // the down-sweep moves this 0 to the 1st element of the result.
+        rowVals[SHARED_MEM(n - 1)] = 0.0;
+    }
+
+    // Step 3: execute the down-sweep phase.
+    for (offset = n / 2; offset > 0; offset /= 2)
+    {
+        GroupMemoryBarrierWithGroupSync();
+
+        /// a1 = (2 * i + 1) * offset - 1;
+        uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
+        uint a2 = a1 + offset;
+
+        if (a2 < n)
+        {
+            float t1 = rowVals[SHARED_MEM(a1)];
+            rowVals[SHARED_MEM(a1)]  = rowVals[SHARED_MEM(a2)];
+            rowVals[SHARED_MEM(a2)] += t1;
+        }
-    // Store the value of the integral.
-    rowIntegralValues[j] = rowIntegralValue;
-    if (groupIndex == 0)
+
+    // The elements read below were written by other threads during the last down-sweep iteration.
+    GroupMemoryBarrierWithGroupSync();
+
+    // Compute the CDF. Note: the value at (i = n) is implicitly 1.
+    conditionalDensities[uint2(i1, jk)] = rowVals[SHARED_MEM(i1)] / rowValSum;
+    conditionalDensities[uint2(i2, jk)] = rowVals[SHARED_MEM(i2)] / rowValSum;
+}
+
+#pragma kernel ComputeMarginalRowDensities
+
+groupshared float rowInts[SHARED_MEM(8 * MIP1_SIZE)];
+
+// Kernel 2: turns the (6 * MIP1_SIZE) row integrals stored in 'marginalRowDensities' into a CDF in place,
+// and stores the average of the row integrals in the extra texel at index (6 * MIP1_SIZE).
+[numthreads(8 * MIP1_SIZE / 2, 1, 1)]
+void ComputeMarginalRowDensities(uint3 groupThreadId : SV_GroupThreadID)
+{
+    // The size of the input is (6 * MIP1_SIZE).
+    // However, the algorithm only works with inputs of sizes which are powers of 2,
+    // therefore there is a single thread group processing (8 * MIP1_SIZE) texels (2 per thread).
+    const uint sz = 6 * MIP1_SIZE;
+    const uint n  = 8 * MIP1_SIZE;
+    const uint i  = groupThreadId.x;
+    const uint i1 = i;
+    const uint i2 = i + n / 2;
+
+    // --------------------------------------------------------------------
+    // Compute the integral of the step function (row integrals).
+    // Perform a block-level parallel scan.
+    // Ref: GPU Gems 3, Chapter 39: "Parallel Prefix Sum (Scan) with CUDA".
+    // TODO: process 4 texels per thread, and manually unroll.
+    // --------------------------------------------------------------------
+
+    // Step 1: load the row of data into shared memory.
+    // Elements past the end of the input are padded with 0, which does not affect the sums.
+    rowInts[SHARED_MEM(i1)] = (i1 < sz) ? marginalRowDensities[uint2(i1, 0)] : 0.0;
+    rowInts[SHARED_MEM(i2)] = (i2 < sz) ? marginalRowDensities[uint2(i2, 0)] : 0.0;
+
+    uint offset;
+
+    // Step 2: execute the up-sweep phase.
+    for (offset = 1; offset <= n / 2; offset *= 2)
-        // Compute the integral of the step function.
-        float imgIntegralValue = 0.0;
+        GroupMemoryBarrierWithGroupSync();
+
+        /// a1 = (2 * i + 1) * offset - 1;
+        uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
+        uint a2 = a1 + offset;
-        // TODO: run in parallel.
-        for (i = 0; i < textureSize; i++)
+        if (a2 < n)
-            temp[i] = imgIntegralValue;
+            rowInts[SHARED_MEM(a2)] += rowInts[SHARED_MEM(a1)];
+        }
+    }
+
+    GroupMemoryBarrierWithGroupSync();
+
+    // Prevent NaNs arising from the division of 0 by 0.
+    float rowIntSum = max(rowInts[SHARED_MEM(n - 1)], FLT_MIN);
+
+    // Every thread must have read the total before thread 0 overwrites it below.
+    GroupMemoryBarrierWithGroupSync();
+
+    if (i == 0)
+    {
+        float imgIntegralValue = rowIntSum / sz;
+        marginalRowDensities[uint2(sz, 0)] = imgIntegralValue;
+        // The exclusive scan clears the last element before the down-sweep;
+        // the down-sweep moves this 0 to the 1st element of the result.
+        rowInts[SHARED_MEM(n - 1)] = 0.0;
+    }
+
+    // Step 3: execute the down-sweep phase.
+    for (offset = n / 2; offset > 0; offset /= 2)
+    {
+        GroupMemoryBarrierWithGroupSync();
-            imgIntegralValue += rowIntegralValues[i] / textureSize;
-        }
+        /// a1 = (2 * i + 1) * offset - 1;
+        uint a1 = Mad24(Mad24(2, i, 1), offset, -1);
+        uint a2 = a1 + offset;
-        // Compute the CDF. Note: the value at (i = textureSize) is implicitly 1.
-        // TODO: run in parallel.
-        for (i = 0; i < textureSize; i++)
+        if (a2 < n)
-            marginalRowDensities[int2(i, cubeFaceId)] = temp[i] / imgIntegralValue;
+            float t1 = rowInts[SHARED_MEM(a1)];
+            rowInts[SHARED_MEM(a1)]  = rowInts[SHARED_MEM(a2)];
+            rowInts[SHARED_MEM(a2)] += t1;
-
-        // Store the value of the integral of the entire image.
-        // TODO: find a better place for this.
-        marginalRowDensities[int2(0, 6)] = imgIntegralValue;
+
+    GroupMemoryBarrierWithGroupSync();
+
+    // Compute the CDF. Note: the value at (i = n) is implicitly 1.
+    if (i1 < sz) { marginalRowDensities[uint2(i1, 0)] = rowInts[SHARED_MEM(i1)] / rowIntSum; }
+    if (i2 < sz) { marginalRowDensities[uint2(i2, 0)] = rowInts[SHARED_MEM(i2)] / rowIntSum; }
 }
--- a/Assets/ScriptableRenderLoop/HDRenderLoop/Sky/Resources/GGXConvolve.shader
+++ b/Assets/ScriptableRenderLoop/HDRenderLoop/Sky/Resources/GGXConvolve.shader

            TEXTURECUBE(_MainTex);
            SAMPLERCUBE(sampler_MainTex);
+
+            // CDF of the texels within each row; bound by SkyManager when MIS is enabled.
+            TEXTURE2D(_ConditionalDensities);
+            SAMPLER2D(sampler_ConditionalDensities);
+
+            // CDF of the rows; bound by SkyManager when MIS is enabled.
+            TEXTURE2D(_MarginalRowDensities);
+            SAMPLER2D(sampler_MarginalRowDensities);
+
            float _Level;
            float _InvOmegaP;

--- a/Assets/ScriptableRenderLoop/HDRenderLoop/Sky/SkyManager.cs
+++ b/Assets/ScriptableRenderLoop/HDRenderLoop/Sky/SkyManager.cs
 using System.Collections.Generic;
 using System;

-
 namespace UnityEngine.Experimental.ScriptableRenderLoop
 {
    [Serializable]
    {
        RenderTexture           m_SkyboxCubemapRT = null;
        RenderTexture           m_SkyboxGGXCubemapRT = null;
+        RenderTexture           m_SkyboxMarginalRowCdfRT = null;
+        RenderTexture           m_SkyboxConditionalCdfRT = null;
+
+        ComputeShader           m_BuildProbabilityTablesCS = null;
+        int                     m_ConditionalDensitiesKernel = -1;
+        int                     m_MarginalRowDensitiesKernel = -1;

        Vector4                 m_CubemapScreenSize;
        Matrix4x4[]             m_faceCameraViewProjectionMatrix = new Matrix4x4[6];
            {
                Utilities.Destroy(m_SkyboxCubemapRT);
                Utilities.Destroy(m_SkyboxGGXCubemapRT);
+                Utilities.Destroy(m_SkyboxMarginalRowCdfRT);
+                Utilities.Destroy(m_SkyboxConditionalCdfRT);

                m_UpdateRequired = true; // Special case. Even if update mode is set to OnDemand, we need to regenerate the environment after destroying the texture.
            }
                m_SkyboxGGXCubemapRT.autoGenerateMips = false;
                m_SkyboxGGXCubemapRT.filterMode = FilterMode.Trilinear;
                m_SkyboxGGXCubemapRT.Create();
+
+                // + 1 because we store the value of the integral of the cubemap at the end of the texture.
+                m_SkyboxMarginalRowCdfRT = new RenderTexture(6 * resolution / 2 + 1, 1, 1, RenderTextureFormat.RFloat);
+                m_SkyboxMarginalRowCdfRT.dimension = TextureDimension.Tex2D;
+                m_SkyboxMarginalRowCdfRT.useMipMap = false;
+                m_SkyboxMarginalRowCdfRT.autoGenerateMips = false;
+                m_SkyboxMarginalRowCdfRT.enableRandomWrite = true;
+                m_SkyboxMarginalRowCdfRT.filterMode = FilterMode.Point;
+                m_SkyboxMarginalRowCdfRT.Create();
+
+                m_SkyboxConditionalCdfRT = new RenderTexture(resolution / 2, 6 * resolution / 2, 1, RenderTextureFormat.RFloat);
+                m_SkyboxConditionalCdfRT.dimension = TextureDimension.Tex2D;
+                m_SkyboxConditionalCdfRT.useMipMap = false;
+                m_SkyboxConditionalCdfRT.autoGenerateMips = false;
+                m_SkyboxConditionalCdfRT.enableRandomWrite = true;
+                m_SkyboxConditionalCdfRT.filterMode = FilterMode.Point;
+                m_SkyboxConditionalCdfRT.Create();
            }

            m_CubemapScreenSize = new Vector4((float)resolution, (float)resolution, 1.0f / (float)resolution, 1.0f / (float)resolution);
            // TODO: We need to have an API to send our sky information to Enlighten. For now use a workaround through skybox/cubemap material...
            m_StandardSkyboxMaterial = Utilities.CreateEngineMaterial("Skybox/Cubemap");
            m_GGXConvolveMaterial = Utilities.CreateEngineMaterial("Hidden/HDRenderLoop/GGXConvolve");
+            m_BuildProbabilityTablesCS = Resources.Load<ComputeShader>("BuildProbabilityTables");
+
+            m_ConditionalDensitiesKernel = m_BuildProbabilityTablesCS.FindKernel("ComputeConditionalDensities");
+            m_MarginalRowDensitiesKernel = m_BuildProbabilityTablesCS.FindKernel("ComputeMarginalRowDensities");

            m_CurrentUpdateTime = 0.0f;
        }
            Utilities.Destroy(m_GGXConvolveMaterial);
            Utilities.Destroy(m_SkyboxCubemapRT);
            Utilities.Destroy(m_SkyboxGGXCubemapRT);
+            Utilities.Destroy(m_SkyboxMarginalRowCdfRT);
+            Utilities.Destroy(m_SkyboxConditionalCdfRT);

            if(m_Renderer != null)
                m_Renderer.Cleanup();
            }
        }

+        // Fills m_SkyboxConditionalCdfRT and m_SkyboxMarginalRowCdfRT from m_SkyboxCubemapRT
+        // by dispatching the two kernels of the 'BuildProbabilityTables' compute shader.
+        private void BuildProbabilityTables(RenderLoop renderLoop)
+        {
+            // Bind the input cubemap as a Texture2DArray.
+            // TODO: for some reason, Unity only binds the first face...
+            m_BuildProbabilityTablesCS.SetTexture(m_ConditionalDensitiesKernel, "envMap", m_SkyboxCubemapRT);
+
+            // Bind the outputs.
+            // The marginal table is bound to both kernels: the first one writes the row integrals,
+            // the second one converts them into a CDF in place.
+            m_BuildProbabilityTablesCS.SetTexture(m_ConditionalDensitiesKernel, "marginalRowDensities", m_SkyboxMarginalRowCdfRT);
+            m_BuildProbabilityTablesCS.SetTexture(m_ConditionalDensitiesKernel, "conditionalDensities", m_SkyboxConditionalCdfRT);
+            m_BuildProbabilityTablesCS.SetTexture(m_MarginalRowDensitiesKernel, "marginalRowDensities", m_SkyboxMarginalRowCdfRT);
+
+            // TODO: the shader has 'TEXTURE_SIZE' hard-coded to 256!
+            int mip1Size = (int)m_SkyParameters.resolution / 2;
+
+            // The 'using' block releases the command buffer even if recording or execution throws.
+            // The name makes the dispatches identifiable in the frame debugger / profiler.
+            using (var cmd = new CommandBuffer() { name = "Build Probability Tables" })
+            {
+                // One thread group per row of MIP 1 (x) and per cubemap face (y).
+                cmd.DispatchCompute(m_BuildProbabilityTablesCS, m_ConditionalDensitiesKernel, mip1Size, 6, 1);
+                // A single thread group processes all of the row integrals.
+                cmd.DispatchCompute(m_BuildProbabilityTablesCS, m_MarginalRowDensitiesKernel, 1, 1, 1);
+                renderLoop.ExecuteCommandBuffer(cmd);
+            }
+        }
+
+            bool useMIS = false;
+
            using (new Utilities.ProfilingSample("Sky Pass: GGX Convolution", renderLoop))
            {
                int mipCount = 1 + (int)Mathf.Log(input.width, 2.0f);
                    return;
+                }
+
+                if (useMIS)
+                {
+                    BuildProbabilityTables(renderLoop);
                }

                // Copy the first mip.

                m_GGXConvolveMaterial.SetTexture("_MainTex", input);
                m_GGXConvolveMaterial.SetFloat("_InvOmegaP", invOmegaP);
+
+                if (useMIS)
+                {
+                    m_GGXConvolveMaterial.SetTexture("_ConditionalDensities", m_SkyboxConditionalCdfRT);
+                    m_GGXConvolveMaterial.SetTexture("_MarginalRowDensities", m_SkyboxMarginalRowCdfRT);
+                }

                for (int mip = 1; mip < ((int)EnvConstants.SpecCubeLodStep + 1); ++mip)
                {
--- a/Assets/ScriptableRenderLoop/ShaderLibrary/API/D3D11.hlsl
+++ b/Assets/ScriptableRenderLoop/ShaderLibrary/API/D3D11.hlsl
 #define LOAD_TEXTURE2D(textureName, unCoord2) textureName.Load(int3(unCoord2, 0))
 #define LOAD_TEXTURE2D_LOD(textureName, unCoord2, lod) textureName.Load(int3(unCoord2, lod))
 #define LOAD_TEXTURE2D_MSAA(textureName, unCoord2, sampleIndex) textureName.Load(unCoord2, sampleIndex)
+#define LOAD_TEXTURE2D_ARRAY(textureName, unCoord2, index) textureName.Load(int4(unCoord2, index, 0))
+#define LOAD_TEXTURE2D_ARRAY_LOD(textureName, unCoord2, index, lod) textureName.Load(int4(unCoord2, index, lod))

 #define GATHER_TEXTURE2D(textureName, samplerName, coord2) textureName.Gather(samplerName, coord2)
 #define GATHER_TEXTURE2D_ARRAY(textureName, samplerName, coord2, index) textureName.Gather(samplerName, float3(coord2, index))
--- a/Assets/ScriptableRenderLoop/ShaderLibrary/Common.hlsl
+++ b/Assets/ScriptableRenderLoop/ShaderLibrary/Common.hlsl
 #define Clamp clamp
 #endif // INTRINSIC_CLAMP

+#ifndef INTRINSIC_MUL24 // Generic fallbacks, compiled only when INTRINSIC_MUL24 is not already defined.
+int Mul24(int a, int b) // Signed overload: returns a * b using the regular multiplication.
+{
+    return a * b;
+}
+
+uint Mul24(uint a, uint b) // Unsigned overload: returns a * b using the regular multiplication.
+{
+    return a * b;
+}
+#endif // INTRINSIC_MUL24
+
+#ifndef INTRINSIC_MAD24 // Generic fallbacks, compiled only when INTRINSIC_MAD24 is not already defined.
+int Mad24(int a, int b, int c) // Signed overload: returns a * b + c (multiply-add).
+{
+    return a * b + c;
+}
+
+uint Mad24(uint a, uint b, uint c) // Unsigned overload: returns a * b + c (multiply-add).
+{
+    return a * b + c;
+}
+#endif // INTRINSIC_MAD24
+
 #ifndef INTRINSIC_MED3
 float Med3(float a, float b, float c)
 {
--- a/Assets/ScriptableRenderLoop/ShaderLibrary/Fibonacci.hlsl
+++ b/Assets/ScriptableRenderLoop/ShaderLibrary/Fibonacci.hlsl
            int fibN2 = sampleCount;

            // These are all constants, so this loop will be optimized away.
-            for (int j = 0; j < 16; j++)
+            for (int j = 1; j < 16; j++)
            {
                if (k_FibonacciSeq[j] == fibN1)
                {