Revert "Revert " Reduce the number of dependent texture reads for SSS""

7 年前 · 39cb603c
--- a/ScriptableRenderPipeline/Core/ShaderLibrary/Common.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/Common.hlsl
 // unsigned integer bit field extract implementation
 uint BitFieldExtract(uint data, uint numBits, uint offset)
 {
-    uint mask = 0xFFFFFFFFu >> (32u - numBits);
+    uint mask = UINT_MAX >> (32u - numBits);
    return (data >> offset) & mask;
 }
 #endif // INTRINSIC_BITFIELD_EXTRACT
 #endif // INTRINSIC_CUBEMAP_FACE_ID

 // ----------------------------------------------------------------------------
-// Common math definition and fastmath function
+// Common math functions
-#define PI          3.14159265359
-#define TWO_PI      6.28318530718
-#define FOUR_PI     12.56637061436
-#define INV_PI      0.31830988618
-#define INV_TWO_PI  0.15915494309
-#define INV_FOUR_PI 0.07957747155
-#define HALF_PI     1.57079632679
-#define INV_HALF_PI 0.636619772367
-#define INFINITY    asfloat(0x7F800000)
-#define LOG2_E      1.44269504089
-
-#define FLT_EPSILON 1.192092896e-07 // Smallest positive number, such that 1.0 + FLT_EPSILON != 1.0
-#define FLT_MIN     1.175494351e-38 // Minimum representable positive floating-point number
-#define FLT_MAX     3.402823466e+38 // Maximum representable floating-point number
-
-#define HFLT_MIN    0.00006103515625 // 2^14  it is the same for 10, 11 and 16bit float. ref: https://www.khronos.org/opengl/wiki/Small_Float_Formats
-
 float DegToRad(float deg)
 {
    return deg * (PI / 180.0);
 // Using pow often result to a warning like this
 // "pow(f, e) will not work for negative f, use abs(f) or conditionally handle negative values if you expect them"
 // PositivePow remove this warning when you know the value is positive and avoid inf/NAN.
-TEMPLATE_2_FLT(PositivePow, base, power, return pow(max(abs(base), FLT_EPSILON), power))
+TEMPLATE_2_FLT(PositivePow, base, power, return pow(max(abs(base), FLT_EPS), power))

 // Ref: https://twitter.com/SebAaltonen/status/878250919879639040
 // 2 mads (mad_sat and mad), faster than regular sign
 // Z buffer to linear depth.
 // Correctly handles oblique view frustums. Only valid for projection matrices!
 // Ref: An Efficient Depth Linearization Method for Oblique View Frustums, Eq. 6.
-float LinearEyeDepth(float2 positionSS, float depthRaw, float4 invProjParam)
+float LinearEyeDepth(float2 positionSS, float deviceDepth, float4 invProjParam)
-    float4 positionCS = float4(positionSS * 2.0 - 1.0, depthRaw, 1.0);
+    float4 positionCS = float4(positionSS * 2.0 - 1.0, deviceDepth, 1.0);
+}
+
+// Z buffer to linear depth.
+// Correctly handles oblique view frustums.
+// Typically, this is the cheapest variant, provided you've already computed 'positionWS'.
+// 'viewProjMatrix' must be a 4x4 matrix: the position is extended to a float4 (w = 1.0)
+// and the function returns the W component of the transformed vector, which a 3x3
+// matrix cannot produce.
+float LinearEyeDepth(float3 positionWS, float4x4 viewProjMatrix)
+{
+    return mul(viewProjMatrix, float4(positionWS, 1.0)).w;
 }

 // ----------------------------------------------------------------------------
    return positionSS;
 }

-float4 ComputeClipSpacePosition(float2 positionSS, float depthRaw)
+float4 ComputeClipSpacePosition(float2 positionSS, float deviceDepth)
-    return float4(positionSS * 2.0 - 1.0, depthRaw, 1.0);
+    return float4(positionSS * 2.0 - 1.0, deviceDepth, 1.0);
-float3 ComputeViewSpacePosition(float2 positionSS, float depthRaw, float4x4 invProjMatrix)
+float3 ComputeViewSpacePosition(float2 positionSS, float deviceDepth, float4x4 invProjMatrix)
-    float4 positionCS = ComputeClipSpacePosition(positionSS, depthRaw);
+    float4 positionCS = ComputeClipSpacePosition(positionSS, deviceDepth);
    float4 positionVS = mul(invProjMatrix, positionCS);
    // The view space uses a right-handed coordinate system.
    positionVS.z = -positionVS.z;
-float3 ComputeWorldSpacePosition(float2 positionSS, float depthRaw, float4x4 invViewProjMatrix)
+float3 ComputeWorldSpacePosition(float2 positionSS, float deviceDepth, float4x4 invViewProjMatrix)
-    float4 positionCS  = ComputeClipSpacePosition(positionSS, depthRaw);
+    float4 positionCS  = ComputeClipSpacePosition(positionSS, deviceDepth);
    float4 hpositionWS = mul(invViewProjMatrix, positionCS);
    return hpositionWS.xyz / hpositionWS.w;
 }

 struct PositionInputs
 {
-    // Normalize screen position (offset by 0.5)
-    float2 positionSS;
-    // Unormalize screen position (offset by 0.5)
-    uint2 unPositionSS;
-    uint2 unTileCoord;
-
-    float depthRaw; // raw depth from depth buffer
-    float depthVS;
-
-    float3 positionWS;
+    // TODO: improve the naming convention.
+    // Some options:
+    // positionNDC,   positionSS,   tileCoordSS
+    // pixelCoordUV,  pixelCoordSS, tileCoordSS
+    // pixelCoordSS,  pixelIndexSS, tileIndexSS
+    float3 positionWS;   // World space position (could be camera-relative)
+    float2 positionSS;   // Screen space pixel position : [0, 1) (with the half-pixel offset)
+    uint2  unPositionSS; // Screen space pixel index    : [0, NumPixels)
+    uint2  unTileCoord;  // Screen space tile  index    : [0, NumTiles)
+    float  deviceDepth;  // Depth from the depth buffer : [0, 1]
+    float  linearDepth;  // View space Z coordinate     : [Near, Far]
 };

 // This function is use to provide an easy way to sample into a screen texture, either from a pixel or a compute shaders.
 }

 // From forward
-// depthRaw and depthVS come directly form .zw of SV_Position
-void UpdatePositionInput(float depthRaw, float depthVS, float3 positionWS, inout PositionInputs posInput)
+// deviceDepth and linearDepth come directly from .zw of SV_Position
+void UpdatePositionInput(float deviceDepth, float linearDepth, float3 positionWS, inout PositionInputs posInput)
-    posInput.depthRaw   = depthRaw;
-    posInput.depthVS    = depthVS;
-    posInput.positionWS = positionWS;
+    posInput.deviceDepth = deviceDepth;
+    posInput.linearDepth = linearDepth;
+    posInput.positionWS  = positionWS;
-void UpdatePositionInput(float depthRaw, float4x4 invViewProjMatrix, float4x4 viewProjMatrix, inout PositionInputs posInput)
+void UpdatePositionInput(float deviceDepth, float4x4 invViewProjMatrix, float4x4 viewProjMatrix, inout PositionInputs posInput)
-    posInput.depthRaw = depthRaw;
-
-    posInput.positionWS = ComputeWorldSpacePosition(posInput.positionSS, depthRaw, invViewProjMatrix);
+    posInput.deviceDepth = deviceDepth;
+    posInput.positionWS  = ComputeWorldSpacePosition(posInput.positionSS, deviceDepth, invViewProjMatrix);
-    posInput.depthVS = mul(viewProjMatrix, float4(posInput.positionWS, 1.0)).w;
+    posInput.linearDepth = mul(viewProjMatrix, float4(posInput.positionWS, 1.0)).w;
 }

 // The view direction 'V' points towards the camera.
    posInput.positionWS += depthOffsetVS * (-V);

-    float4 positionCS = mul(viewProjMatrix, float4(posInput.positionWS, 1.0));
-    posInput.depthVS  = positionCS.w;
-    posInput.depthRaw = positionCS.z / positionCS.w;
+    float4 positionCS    = mul(viewProjMatrix, float4(posInput.positionWS, 1.0));
+    posInput.linearDepth = positionCS.w;
+    posInput.deviceDepth = positionCS.z / positionCS.w;
 }

 // ----------------------------------------------------------------------------
--- a/ScriptableRenderPipeline/Core/ShaderLibrary/CommonLighting.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/CommonLighting.hlsl
 // These clamping function to max of floating point 16 bit are use to prevent INF in code in case of extreme value
 float ClampToFloat16Max(float value)
 {
-    return min(value, 65504.0);
+    return min(value, HALF_MAX);
-    return min(value, 65504.0);
+    return min(value, HALF_MAX);
-    return min(value, 65504.0);
+    return min(value, HALF_MAX);
-    return min(value, 65504.0);
+    return min(value, HALF_MAX);
 }

 // Ligthing convention
--- a/ScriptableRenderPipeline/Core/ShaderLibrary/CommonMaterial.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/CommonMaterial.hlsl
 // all pixels which belong to an SSS material are not black (those that don't always are).
 float3 TagLightingForSSS(float3 subsurfaceLighting)
 {
-    subsurfaceLighting.r = max(subsurfaceLighting.r, HFLT_MIN);
+    subsurfaceLighting.r = max(subsurfaceLighting.r, HALF_MIN);
    return subsurfaceLighting;
 }

--- a/ScriptableRenderPipeline/Core/ShaderLibrary/ImageBasedLighting.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/ImageBasedLighting.hlsl
    float m = PerceptualRoughnessToRoughness(perceptualRoughness);

    // Remap to spec power. See eq. 21 in --> https://dl.dropboxusercontent.com/u/55891920/papers/mm_brdf.pdf
-    float n = (2.0 / max(FLT_EPSILON, m * m)) - 2.0;
+    float n = (2.0 / max(FLT_EPS, m * m)) - 2.0;
-    n /= (4.0 * max(NdotR, FLT_EPSILON));
+    n /= (4.0 * max(NdotR, FLT_EPS));

    // remap back to square root of real roughness (0.25 include both the sqrt root of the conversion and sqrt for going from roughness to perceptualRoughness)
    perceptualRoughness = pow(2.0 / (n + 2.0), 0.25);
--- a/ScriptableRenderPipeline/Core/ShaderLibrary/Macros.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/Macros.hlsl
 #define SAMPLE_TEXTURECUBE_ARRAY_LOD_ABSTRACT(textureName, samplerName, coord3, index, lod) SAMPLE_TEXTURECUBE_ARRAY_LOD(textureName, samplerName, coord3, index, lod)
 #endif

+#define PI          3.14159265358979323846
+#define TWO_PI      6.28318530717958647693
+#define FOUR_PI     12.5663706143591729538
+#define INV_PI      0.31830988618379067154
+#define INV_TWO_PI  0.15915494309189533577
+#define INV_FOUR_PI 0.07957747154594766788
+#define HALF_PI     1.57079632679489661923
+#define INV_HALF_PI 0.63661977236758134308
+#define LOG2_E      1.44269504088896340736
+#define INFINITY    asfloat(0x7F800000)
+
+#define FLT_EPS     1.192092896e-07	// Smallest positive number, such that 1.0 + FLT_EPS != 1.0
+#define FLT_MIN     1.175494351e-38	// Minimum representable positive floating-point number
+#define FLT_MAX     3.402823466e+38	// Maximum representable floating-point number
+#define HALF_MIN    6.103515625e-5  // 2^-14, the same value for 10, 11 and 16-bit: https://www.khronos.org/opengl/wiki/Small_Float_Formats
+#define HALF_MAX    65504.0
+#define UINT_MAX    0xFFFFFFFFu
+
 #define TEMPLATE_1_FLT(FunctionName, Parameter1, FunctionBody) \
 float  FunctionName(float  Parameter1) { FunctionBody; } \
 float2 FunctionName(float2 Parameter1) { FunctionBody; } \
--- a/ScriptableRenderPipeline/Core/ShaderLibrary/NormalSurfaceGradient.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/NormalSurfaceGradient.hlsl
 float3 SurfaceGradientFromPerturbedNormal(float3 nrmVertexNormal, float3 v)
 {
    float3 n = nrmVertexNormal;
-    float s = 1.0 / max(FLT_EPSILON, abs(dot(n, v)));
+    float s = 1.0 / max(FLT_EPS, abs(dot(n, v)));
    return s * (dot(n, v) * n - v);
 }

--- a/ScriptableRenderPipeline/Core/ShaderLibrary/Packing.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/Packing.hlsl
 // Packs an integer stored using at most 'numBits' into a [0..1] float.
 float PackInt(uint i, uint numBits)
 {
-    uint maxInt = 0xFFFFFFFFu >> (32u - numBits);
+    uint maxInt = UINT_MAX >> (32u - numBits);
    return saturate(i * rcp(maxInt));
 }

-    uint maxInt = 0xFFFFFFFFu >> (32u - numBits);
+    uint maxInt = UINT_MAX >> (32u - numBits);
    return (uint)(f * maxInt + 0.5); // Round instead of truncating
 }


 float UnpackUIntToFloat(uint src, uint numBits, uint offset)
 {
-    uint maxInt = 0xFFFFFFFFu >> (32u - numBits);
+    uint maxInt = UINT_MAX >> (32u - numBits);
    return float(BitFieldExtract(src, numBits, offset)) * rcp(maxInt);
 }

--- a/ScriptableRenderPipeline/Core/ShaderLibrary/VolumeRendering.hlsl
+++ b/ScriptableRenderPipeline/Core/ShaderLibrary/VolumeRendering.hlsl
 // Absorption coefficient from Disney: http://blog.selfshadow.com/publications/s2015-shading-course/burley/s2015_pbs_disney_bsdf_notes.pdf
 float3 TransmittanceColorAtDistanceToAbsorption(float3 transmittanceColor, float atDistance)
 {
-    return -log(transmittanceColor + FLT_EPSILON) / max(atDistance, FLT_EPSILON);
+    return -log(transmittanceColor + FLT_EPS) / max(atDistance, FLT_EPS);
 }


--- a/ScriptableRenderPipeline/Core/Shadow/Shadow.cs
+++ b/ScriptableRenderPipeline/Core/Shadow/Shadow.cs
            }

            m_TmpSortKeys.Sort( new SortReverter() );
-            m_TmpSortKeys.ExtractTo( shadowRequests, 0, out shadowRequestsCount, delegate(long key) { return (int) (key & 0xffffffff); } );
+            // Keep only the low 32 bits of each sort key. UINT_MAX is an HLSL macro and is not defined on the C# side, so use uint.MaxValue.
+            m_TmpSortKeys.ExtractTo( shadowRequests, 0, out shadowRequestsCount, delegate(long key) { return (int) (key & uint.MaxValue); } );
        }

        protected override void PruneShadowCasters( Camera camera, List<VisibleLight> lights, ref VectorArray<int> shadowRequests, ref ShadowRequestVector requestsGranted, out uint totalRequestCount )
--- a/ScriptableRenderPipeline/HDRenderPipeline/Debug/DebugViewMaterialGBuffer.shader
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Debug/DebugViewMaterialGBuffer.shader

                BSDFData bsdfData;
                BakeLightingData bakeLightingData;
-                DECODE_FROM_GBUFFER(posInput.unPositionSS, 0xFFFFFFFF, bsdfData, bakeLightingData.bakeDiffuseLighting);
+                DECODE_FROM_GBUFFER(posInput.unPositionSS, UINT_MAX, bsdfData, bakeLightingData.bakeDiffuseLighting);
                #ifdef SHADOWS_SHADOWMASK
                DecodeShadowMask(LOAD_TEXTURE2D(_ShadowMaskTexture, posInput.unPositionSS), bakeLightingData.bakeShadowMask);
                #endif

                if (_DebugViewMaterial == DEBUGVIEWGBUFFER_DEPTH)
                {
-                    float linearDepth = frac(posInput.depthVS * 0.1);
+                    float linearDepth = frac(posInput.linearDepth * 0.1);
                    result = linearDepth.xxx;
                }
                // Caution: This value is not the same than the builtin data bakeDiffuseLighting. It also include emissive and multiply by the albedo
--- a/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/ClusteredUtils.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/ClusteredUtils.hlsl
 float SuggestLogBase50(float tileFarPlane)
 {
    const float C = (float)(1 << g_iLog2NumClusters);
-    float rangeFittedDistance = clamp((tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane), FLT_EPSILON, 1.0);
+    float rangeFittedDistance = clamp((tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane), FLT_EPS, 1.0);
    float suggested_base = pow((1.0 + sqrt(max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance)))) / (2.0 * rangeFittedDistance), 2.0 / C);      //
    return max(g_fClustBase, suggested_base);
 }
 {
    const float C = (float)(1 << g_iLog2NumClusters);
-    float rangeFittedDistance = clamp((tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane), FLT_EPSILON, 1.0);
+    float rangeFittedDistance = clamp((tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane), FLT_EPS, 1.0);
    float suggested_base = pow((1 / 2.3) * max(0.0, (0.8 / rangeFittedDistance) - 1), 4.0 / (C * 2));     // approximate inverse of d*x^4 + (-x) + (1-d) = 0       - d is normalized distance
    return max(g_fClustBase, suggested_base);
 }
--- a/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/SortingComputeUtils.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/SortingComputeUtils.hlsl
 // have to make this sort routine a macro unfortunately because hlsl doesn't take
 // groupshared memory of unspecified length as an input parameter to a function.
 // maxcapacity_in must be a power of two.
-// all data from length_in and up to closest power of two will be filled with 0xffffffff
+// all data from length_in and up to closest power of two will be filled with UINT_MAX
-    for(int t=length+localThreadID; t<N; t+=nrthreads) { data[t]=0xffffffff; }              \
+    for(int t=length+localThreadID; t<N; t+=nrthreads) { data[t]=UINT_MAX; }              \
    GroupMemoryBarrierWithGroupSync();                                                      \
                                                                                            \
    for(int k=2; k<=N; k=2*k)                                                               \
--- a/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/TilePassLoop.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/TilePassLoop.hlsl
        logBase = g_logBaseBuffer[tileIndex.y * _NumTileClusteredX + tileIndex.x];
    }

-    int clustIdx = SnapToClusterIdxFlex(posInput.depthVS, logBase, g_isLogBaseBufferEnabled != 0);
+    int clustIdx = SnapToClusterIdxFlex(posInput.linearDepth, logBase, g_isLogBaseBufferEnabled != 0);

    int nrClusters = (1 << g_iLog2NumClusters);
    const int idx = ((lightCategory * nrClusters + clustIdx) * _NumTileClusteredY + tileIndex.y) * _NumTileClusteredX + tileIndex.x;
--- a/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/lightlistbuild-bigtile.compute
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/lightlistbuild-bigtile.compute
 		SFiniteLightBound lgtDat = g_data[lightsListLDS[l]];

 		if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius, g_isOrthographic!=0) )
-			lightsListLDS[l]=0xffffffff;
+			lightsListLDS[l]=UINT_MAX;
 	}

 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 				int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));

 				bool bFoundSepPlane = (resh*resf)<0;
-				if(bFoundSepPlane) lightsListLDS[l]=0xffffffff;
+				if(bFoundSepPlane) lightsListLDS[l]=UINT_MAX;
 			}
 		}
 	}
--- a/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/lightlistbuild-clustered.compute
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/lightlistbuild-clustered.compute
 		SFiniteLightBound lgtDat = g_data[coarseList[l]];

 		if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius, g_isOrthographic!=0) )
-			coarseList[l]=0xffffffff;
+			coarseList[l]=UINT_MAX;
 	}

 #if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
 		int offs = 0;
 		for(int l=0; l<iNrCoarseLights; l++)
 		{
-			if(coarseList[l]!=0xffffffff)
+			if(coarseList[l]!=UINT_MAX)
 				coarseList[offs++] = coarseList[l];
 		}
 		lightOffsSph = offs;
--- a/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/scrbound.compute
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Lighting/TilePass/scrbound.compute
 				float fW = vPnts[k].w;
 				float fS = fW<0 ? -1 : 1;
 				float fWabs = fW<0 ? (-fW) : fW;
-				fW = fS * (fWabs<FLT_EPSILON ? FLT_EPSILON : fWabs);
+				fW = fS * (fWabs<FLT_EPS ? FLT_EPS : fWabs);
 				float3 vP = float3(vPnts[k].x/fW, vPnts[k].y/fW, vPnts[k].z/fW);
 				if(k==0) { vMin=vP; vMax=vP; }

--- a/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Lit.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Lit.hlsl
    float depth = LinearEyeDepth(pyramidDepth, _ZBufferParams);

    // Distance from point to the back plane
-    float depthFromPositionInput = depth - posInputs.depthVS;
+    float depthFromPositionInput = depth - posInputs.linearDepth;

    float offset = dot(-V, positionWS - posInputs.positionWS);
    float depthFromPosition = depthFromPositionInput - offset;
 // If a user do a lighting architecture without material classification, this can be remove
 #include "../../Lighting/TilePass/TilePass.cs.hlsl"

-static int g_FeatureFlags = 0xFFFFFFFF;
+static uint g_FeatureFlags = UINT_MAX;
-bool HasMaterialFeatureFlag(int flag)
+bool HasMaterialFeatureFlag(uint flag)
 {
    return ((g_FeatureFlags & flag) != 0);
 }

    // The material features system for material classification must allow compile time optimization (i.e everything should be static)
    // Note that as we store materialId for Aniso based on content of RT2 we need to add few extra condition.
-    // The code is also call from MaterialFeatureFlagsFromGBuffer, so must work fully dynamic if featureFlags is 0xFFFFFFFF
+    // The code is also call from MaterialFeatureFlagsFromGBuffer, so must work fully dynamic if featureFlags is UINT_MAX
    int supportsStandard = HasMaterialFeatureFlag(MATERIALFEATUREFLAGS_LIT_STANDARD);
    int supportsSSS = HasMaterialFeatureFlag(MATERIALFEATUREFLAGS_LIT_SSS);
    int supportsAniso = HasMaterialFeatureFlag(MATERIALFEATUREFLAGS_LIT_ANISO);

    DecodeFromGBuffer(
        unPositionSS,
-        0xFFFFFFFF,
+        UINT_MAX,
        bsdfData,
        unused
    );
    float diffuseFGD;

    // Area lights (17 VGPRs)
+    // TODO: 'orthoBasisViewNormal' is just a rotation around the normal and should thus be just 1x VGPR.
    float3x3 orthoBasisViewNormal; // Right-handed view-dependent orthogonal basis around the normal (6x VGPRs)
    float3x3 ltcTransformDiffuse;  // Inverse transformation for Lambertian or Disney Diffuse        (4x VGPRs)
    float3x3 ltcTransformSpecular; // Inverse transformation for GGX                                 (4x VGPRs)
        float NdotL = saturate(dot(bsdfData.coatNormalWS, L));
        float NdotV = preLightData.coatNdotV;
        float LdotV = dot(L, V);
-        float invLenLV = rsqrt(max(2 * LdotV + 2, FLT_EPSILON));
+        float invLenLV = rsqrt(max(2 * LdotV + 2, FLT_EPS));
        float NdotH = saturate((NdotL + NdotV) * invLenLV);
        float LdotH = saturate(invLenLV * LdotV + invLenLV);

    float NdotL    = saturate(dot(bsdfData.normalWS, L)); // Must have the same value without the clamp
    float NdotV    = preLightData.NdotV;                  // Get the unaltered (geometric) version
    float LdotV    = dot(L, V);
-    float invLenLV = rsqrt(max(2 * LdotV + 2, FLT_EPSILON)); // invLenLV = rcp(length(L + V)) - caution about the case where V and L are opposite, it can happen, use max to avoid this
+    float invLenLV = rsqrt(max(2 * LdotV + 2, FLT_EPS));  // invLenLV = rcp(length(L + V)) - caution about the case where V and L are opposite, it can happen, use max to avoid this
    float NdotH    = saturate((NdotL + NdotV) * invLenLV);
    float LdotH    = saturate(invLenLV * LdotV + invLenLV);

 #endif

 #ifdef SHADOWS_SHADOWMASK
-        float fade = saturate(posInput.depthVS * lightData.fadeDistanceScaleAndBias.x + lightData.fadeDistanceScaleAndBias.y);
+        float fade = saturate(posInput.linearDepth * lightData.fadeDistanceScaleAndBias.x + lightData.fadeDistanceScaleAndBias.y);

        // See comment in EvaluateBSDF_Punctual
        shadow = lightData.dynamicShadowCasterOnly ? min(shadowMask, shadow) : shadow;

    // Exit if texel is out of color buffer
    // Or if the texel is from an object in front of the object
-    if (refractedBackPointDepth < posInput.depthVS
+    if (refractedBackPointDepth < posInput.linearDepth
        || any(refractedBackPointSS < 0.0)
        || any(refractedBackPointSS > 1.0))
    {
--- a/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute
 #endif
 groupshared bool   processGroup;

-bool StencilTest(int2 pixelCoord, float stencilRef)
-{
-    bool passedStencilTest;
-
-#if SSS_SAMPLE_TEST_HTILE
-    int2 tileCoord = pixelCoord / 8;
-
-    // Perform the stencil test (reject at the tile rate).
-    passedStencilTest = stencilRef == LOAD_TEXTURE2D(_HTile, tileCoord).r;
-
-    [branch] if (passedStencilTest)
-#else
-    // It is extremely uncommon for individual samples to fail the HTile test.
-    // Unfortunately, our copy of HTile does not allow to accept at the tile rate.
-    // Therefore, we choose not to perform the HiS test here.
-#endif
-    {
-        // Unfortunately, our copy of HTile does not allow to accept at the tile rate.
-        // Therefore, we have to additionally perform the stencil test at the pixel rate.
-        // We check the tagged irradiance buffer to avoid an extra stencil texture fetch.
-        passedStencilTest = TestLightingForSSS(LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb);
-    }
-
-    return passedStencilTest;
-}
-
 #if SSS_USE_LDS_CACHE
 float4 LoadSampleFromCacheMemory(int2 cacheCoord)
 {
 // Returns {irradiance, linearDepth}.
 float4 LoadSample(int2 pixelCoord, int2 cacheAnchor)
 {
+#if SSS_USE_LDS_CACHE
-#if SSS_USE_LDS_CACHE
    [branch] if (isInCache)
    {
        return LoadSampleFromCacheMemory(cacheCoord);
    {
-        float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;
-
-        [branch] if (StencilTest(pixelCoord, stencilRef))
-        {
-            return LoadSampleFromVideoMemory(pixelCoord);
-        }
-        else
-        {
-            return float4(0, 0, 0, 0);
-        }
+        // Always load both irradiance and depth.
+        // Avoid dependent texture reads at the cost of extra bandwidth.
+        return LoadSampleFromVideoMemory(pixelCoord);
    }
 }

    if (TestLightingForSSS(irradiance))
    {
        // Apply bilateral weighting.
-        float  linearDepth = textureSample.a;
-        float  z = linearDepth - centerPosVS.z;
-        float  p = _FilterKernels[profileID][i][iP];
-        float3 w = ComputeBilateralWeight(xy2, z, mmPerUnit, shapeParam, p);
+        float  viewZ  = textureSample.a;
+        float  relZ   = viewZ - centerPosVS.z;
+        float  rcpPdf = _FilterKernels[profileID][i][iP];
+        float3 weight = ComputeBilateralWeight(xy2, relZ, mmPerUnit, shapeParam, rcpPdf);
-        totalIrradiance += w * irradiance;
-        totalWeight     += w;
+        totalIrradiance += weight * irradiance;
+        totalWeight     += weight;
    }
    else
    {

    [branch] if (!processGroup) { return; }

-    float3 centerIrradiance = 0;
-    float  centerDepth      = 0;
-    float4 cachedValue      = 0;
-
-    bool passedStencilTest = StencilTest((int2)pixelCoord, stencilRef);
+    float3 centerIrradiance  = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
+    float  centerDepth       = 0;
+    float  centerViewZ       = 0;
+    bool   passedStencilTest = TestLightingForSSS(centerIrradiance);
+    // Save some bandwidth by only loading depth values for SSS pixels.
-        centerIrradiance = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
-        centerDepth      = LOAD_TEXTURE2D(_DepthTexture,     pixelCoord).r;
-        cachedValue      = float4(centerIrradiance, LinearEyeDepth(centerDepth, _ZBufferParams));
+        centerDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
+        centerViewZ = LinearEyeDepth(centerDepth, _ZBufferParams);
-    textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)] = cachedValue;
+    textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)] = float4(centerIrradiance, centerViewZ);

    uint numBorderQuadsPerWave = TEXTURE_CACHE_SIZE_1D / 2 - 1;
    uint halfCacheWidthInQuads = TEXTURE_CACHE_SIZE_1D / 4;
        uint2 quadCoord;

        // The traversal order is such that the quad's X coordinate is monotonically increasing.
+        // The corner is always near the block of the corresponding wavefront.
-            case 0:
+            case 0:  // Bottom left
-            case 1:
+            case 1:  // Bottom right
-            case 2:
+            case 2:  // Top left
-            default: // 3
+            default: // Top right
-        uint2  cacheCoord2  = 2 * (startQuad + quadCoord) + uint2(laneIndex & 1, (laneIndex >> 1) & 1);
-        int2   pixelCoord2  = (int2)(tileAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
-        float4 cachedValue2 = 0;
+        uint2  cacheCoord2 = 2 * (startQuad + quadCoord) + uint2(laneIndex & 1, (laneIndex >> 1) & 1);
+        int2   pixelCoord2 = (int2)(tileAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
+        float3 irradiance2 = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord2).rgb;
+        float  viewZ2      = 0;
-        [branch] if (StencilTest(pixelCoord2, stencilRef))
+        // Save some bandwidth by only loading depth values for SSS pixels.
+        [branch] if (TestLightingForSSS(irradiance2))
-            cachedValue2 = LoadSampleFromVideoMemory(pixelCoord2);
+            viewZ2 = LinearEyeDepth(LOAD_TEXTURE2D(_DepthTexture, pixelCoord2).r, _ZBufferParams);
-        textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord2.y, cacheCoord2.x)] = cachedValue2;
+        textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord2.y, cacheCoord2.x)] = float4(irradiance2, viewZ2);
    }

    // Wait for the LDS.
    bool useNearFieldKernel = SSS_ENABLE_NEAR_FIELD && maxDistInPixels > SSS_LOD_THRESHOLD;

 #if SSS_DEBUG_LOD
-    StoreResult(pixelCoord, useNearFieldKernel ? float3(1, 0, 0) : float3(0.5, 0.5, 0);
+    StoreResult(pixelCoord, useNearFieldKernel ? float3(1, 0, 0) : float3(0.5, 0.5, 0));
    return;
 #endif

--- a/ScriptableRenderPipeline/HDRenderPipeline/ShaderPass/ShaderPassDepthOnly.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/ShaderPass/ShaderPassDepthOnly.hlsl
    outColor = float4(0.0, 0.0, 0.0, 0.0);

 #ifdef _DEPTHOFFSET_ON
-    outputDepth = posInput.depthRaw;
+    outputDepth = posInput.deviceDepth;
 #endif
 }
--- a/ScriptableRenderPipeline/HDRenderPipeline/ShaderPass/ShaderPassForward.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/ShaderPass/ShaderPassForward.hlsl
    }

 #ifdef _DEPTHOFFSET_ON
-    outputDepth = posInput.depthRaw;
+    outputDepth = posInput.deviceDepth;
 #endif

 #ifdef DEBUG_DISPLAY
--- a/ScriptableRenderPipeline/HDRenderPipeline/ShaderPass/ShaderPassGBuffer.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/ShaderPass/ShaderPassGBuffer.hlsl
    ENCODE_VELOCITY_INTO_GBUFFER(builtinData.velocity, outVelocityBuffer);

 #ifdef _DEPTHOFFSET_ON
-    outputDepth = posInput.depthRaw;
+    outputDepth = posInput.deviceDepth;
 #endif
 }
--- a/ScriptableRenderPipeline/HDRenderPipeline/Sky/AtmosphericScattering/AtmosphericScattering.hlsl
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Sky/AtmosphericScattering/AtmosphericScattering.hlsl
    else if (_FogColorMode == FOGCOLORMODE_SKY_COLOR)
    {
        // Based on Uncharted 4 "Mip Sky Fog" trick: http://advances.realtimerendering.com/other/2016/naughty_dog/NaughtyDog_TechArt_Final.pdf
-        float mipLevel = (1.0 - _MipFogMaxMip * saturate((posInput.depthVS - _MipFogNear) / (_MipFogFar - _MipFogNear))) * _SkyTextureMipCount;
+        float mipLevel = (1.0 - _MipFogMaxMip * saturate((posInput.linearDepth - _MipFogNear) / (_MipFogFar - _MipFogNear))) * _SkyTextureMipCount;
        float3 dir = normalize(posInput.positionWS - GetPrimaryCameraPosition());
        return SampleSkyTexture(dir, mipLevel).rgb;
    }
    if (_AtmosphericScatteringType == FOGTYPE_EXPONENTIAL)
    {
        float3 fogColor = GetFogColor(posInput);
-        float fogFactor = _ExpFogDensity * (1.0f - Transmittance(OpticalDepthHomogeneous(1.0f / _ExpFogDistance, posInput.depthVS)));
+        float fogFactor = _ExpFogDensity * (1.0f - Transmittance(OpticalDepthHomogeneous(1.0f / _ExpFogDistance, posInput.linearDepth)));
-        float fogFactor = _LinearFogDensity * saturate((posInput.depthVS - _LinearFogStart) * _LinearFogOneOverRange);
+        float fogFactor = _LinearFogDensity * saturate((posInput.linearDepth - _LinearFogStart) * _LinearFogOneOverRange);
        return float4(fogColor, fogFactor);
    }
    else // NONE
--- a/ScriptableRenderPipeline/HDRenderPipeline/Sky/BlacksmithlSky/Resources/SkyBlacksmith.shader
+++ b/ScriptableRenderPipeline/HDRenderPipeline/Sky/BlacksmithlSky/Resources/SkyBlacksmith.shader
                #ifdef PERFORM_SKY_OCCLUSION_TEST
                    // Determine whether the sky is occluded by the scene geometry.
                    // Do not perform blending with the environment map if the sky is occluded.
-                    float depthRaw     = max(_SkyDepth, LOAD_TEXTURE2D(_MainDepthTexture, posInput.unPositionSS).r);
-                    float skyTexWeight = (depthRaw > _SkyDepth) ? 0.0 : 1.0;
+                    float deviceDepth  = max(_SkyDepth, LOAD_TEXTURE2D(_MainDepthTexture, posInput.unPositionSS).r);
+                    float skyTexWeight = (deviceDepth > _SkyDepth) ? 0.0 : 1.0;
-                    float depthRaw     = _SkyDepth;
+                    float deviceDepth  = _SkyDepth;
-                    depthRaw     = _SkyDepth;
+                    deviceDepth  = _SkyDepth;
-                UpdatePositionInput(depthRaw, UNITY_MATRIX_I_VP, k_identity4x4, posInput);
+                UpdatePositionInput(deviceDepth, UNITY_MATRIX_I_VP, k_identity4x4, posInput);

                float4 c1, c2, c3;
                VolundTransferScatter(GetAbsolutePositionWS(posInput.positionWS), c1, c2, c3);