// =============== Convolves transmitted radiance with the Disney diffusion profile ================

//--------------------------------------------------------------------------------------------------
// Definitions
//--------------------------------------------------------------------------------------------------

// #pragma enable_d3d11_debug_symbols

// Tweak parameters.
#define SSS_BILATERAL_FILTER  1 // 0 = ignore the depth difference when computing the sample weight
#define SSS_USE_LDS_CACHE     1 // 1 = cache {irradiance, linearDepth} of the tile (and its border) in group-shared memory
#define SSS_ENABLE_NEAR_FIELD 0 // 1 = allow the near field kernel if the filter footprint exceeds SSS_LOD_THRESHOLD pixels
#define SSS_SAMPLE_TEST_HTILE 0 // 1 = test each sample against '_HTile' before the per-pixel stencil test
#define SSS_USE_TANGENT_PLANE 0 // 1 = integrate over the tangent plane, 0 = integrate over the image plane
#define SSS_CLAMP_ARTIFACT    0 // 1 = saturate() the weight of each sample
#define SSS_DEBUG_LOD         0 // 1 = output a color code of the selected kernel instead of the filtered result
#define SSS_DEBUG_NORMAL_VS   0 // 1 = output white for pixels whose view-space normal has a non-negative Z

// Do not modify these.
// N.b.: the order matters. SHADERPASS is defined after including the list of shader passes,
// and UNITY_MATERIAL_LIT (below) is defined before including 'Material.hlsl'.
#include "../../../ShaderPass/ShaderPass.cs.hlsl"
#define SHADERPASS            SHADERPASS_SUBSURFACE_SCATTERING
#define MILLIMETERS_PER_METER 1000
#define CENTIMETERS_PER_METER 100
#define GROUP_SIZE_1D         16
#define GROUP_SIZE_2D         (GROUP_SIZE_1D * GROUP_SIZE_1D)              // 256 threads per group
#define TEXTURE_CACHE_BORDER  2
#define TEXTURE_CACHE_SIZE_1D (GROUP_SIZE_1D + 2 * TEXTURE_CACHE_BORDER)   // 20 texels per side

//--------------------------------------------------------------------------------------------------
// Included headers
//--------------------------------------------------------------------------------------------------

#include "../../../../Core/ShaderLibrary/Packing.hlsl"
#include "../../../../Core/ShaderLibrary/SpaceFillingCurves.hlsl"
#include "../../../ShaderVariables.hlsl"
#define UNITY_MATERIAL_LIT
#include "../../../Material/Material.hlsl"
#include "../../../Lighting/LightDefinition.cs.hlsl"

//--------------------------------------------------------------------------------------------------
// Inputs & outputs
//--------------------------------------------------------------------------------------------------

float4 _WorldScales[SSS_N_PROFILES];                              // Size of the world unit in meters (only the X component is used)
float4 _FilterKernels[SSS_N_PROFILES][SSS_N_SAMPLES_NEAR_FIELD];  // XY = near field, ZW = far field; 0 = radius, 1 = reciprocal of the PDF
DECLARE_GBUFFER_TEXTURE(_GBufferTexture);       // Contains the albedo and SSS parameters
TEXTURE2D(_DepthTexture);                       // Z-buffer
TEXTURE2D(_StencilTexture);                     // DXGI_FORMAT_R8_UINT is not supported by Unity
TEXTURE2D(_HTile);                              // DXGI_FORMAT_R8_UINT is not supported by Unity
TEXTURE2D(_IrradianceSource);                   // Includes transmitted light
RW_TEXTURE2D(float4, _CameraFilteringTexture);  // Target texture

//--------------------------------------------------------------------------------------------------
// Implementation
//--------------------------------------------------------------------------------------------------

// Lower bound of the accumulated filter weight (the smallest normalized positive float).
// KernelValCircle() scales every weight by S, so a color channel with S = 0 accumulates
// a total weight (and a total irradiance) of 0. Clamping the weight turns 0/0 (NaN) into 0.
#define SSS_MIN_TOTAL_WEIGHT 1.175494351e-38

// 6656 bytes used. It appears that the reserved LDS space must be a multiple of 512 bytes.
#if SSS_USE_LDS_CACHE
groupshared float4 textureCache[TEXTURE_CACHE_SIZE_1D * TEXTURE_CACHE_SIZE_1D]; // {irradiance, linearDepth}
#endif
groupshared bool processGroup; // Written by thread 0: 'true' if any HTile of the group matches the stencil ref

// Returns 'true' if the pixel at 'pixelCoord' passes the stencil test against 'stencilRef'.
// If SSS_SAMPLE_TEST_HTILE is enabled, the 8x8 tile is tested first, and the per-pixel
// stencil value is only loaded for tiles which pass.
bool StencilTest(int2 pixelCoord, float stencilRef)
{
    bool passedStencilTest;

#if SSS_SAMPLE_TEST_HTILE
    int2 tileCoord = pixelCoord >> 3; // Divide by 8

    // Perform the stencil test (reject at the tile rate).
    passedStencilTest = stencilRef == LOAD_TEXTURE2D(_HTile, tileCoord).r;

    [branch] if (passedStencilTest)
#else
    // It is extremely uncommon for individual samples to fail the HTile test.
    // Unfortunately, our copy of HTile does not allow to accept at the tile rate.
    // Therefore, we choose not to perform the HiS test here.
#endif
    {
        // Unfortunately, our copy of HTile does not allow to accept at the tile rate.
        // Therefore, we have to additionally perform the stencil test at the pixel rate.
        passedStencilTest = stencilRef == UnpackByte(LOAD_TEXTURE2D(_StencilTexture, pixelCoord).r);
    }

    return passedStencilTest;
}

#if SSS_USE_LDS_CACHE
// Returns the {irradiance, linearDepth} entry of the LDS cache.
// 'cacheCoord' is relative to the top left corner of the cache (border included).
float4 LoadSampleFromCacheMemory(int2 cacheCoord)
{
    return textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)];
}
#endif

// Loads {irradiance, linearDepth} directly from the textures. No stencil test is performed.
float4 LoadSampleFromVideoMemory(int2 pixelCoord)
{
    float3 irradiance = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
    float  depth      = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;

    return float4(irradiance, LinearEyeDepth(depth, _ZBufferParams));
}

// Returns {irradiance, linearDepth}.
// The value is all zeros if the pixel fails the stencil test.
// 'cacheAnchor' is the pixel coordinate of the top left corner of the LDS cache.
float4 LoadSample(int2 pixelCoord, int2 cacheAnchor)
{
#if SSS_USE_LDS_CACHE
    int2 cacheCoord = pixelCoord - cacheAnchor;
    // The cast to 'uint' turns negative coordinates into large values,
    // so a single comparison checks both the lower and the upper bounds.
    bool isInCache  = max((uint)cacheCoord.x, (uint)cacheCoord.y) < TEXTURE_CACHE_SIZE_1D;

    [branch] if (isInCache)
    {
        return LoadSampleFromCacheMemory(cacheCoord);
    }
    else
#endif
    {
        float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;

        [branch] if (StencilTest(pixelCoord, stencilRef))
        {
            return LoadSampleFromVideoMemory(pixelCoord);
        }
        else
        {
            return float4(0, 0, 0, 0);
        }
    }
}

// Computes the value of the integrand over a disk: (2 * PI * r) * KernelVal().
// N.b.: the returned value is multiplied by 4. It is irrelevant due to weight renormalization.
float3 KernelValCircle(float r, float3 S)
{
    float3 expOneThird = exp(((-1.0 / 3.0) * r) * S);
    return /* 0.25 * */ S * (expOneThird + expOneThird * expOneThird * expOneThird);
}

// Computes F(r)/P(r), s.t. r = sqrt(xy^2 + z^2).
// Rescaling of the PDF is handled by 'totalWeight'.
//   xy2       - squared distance in the integration plane
//   z         - difference of linear depth between the sample and the center
//   mmPerUnit - conversion factor from world units to millimeters
//   S         - shape parameter of the diffusion profile (per color channel)
//   rcpPdf    - reciprocal of the PDF of the sample
float3 ComputeBilateralWeight(float xy2, float z, float mmPerUnit, float3 S, float rcpPdf)
{
#if (SSS_BILATERAL_FILTER == 0)
    z = 0;
#endif

#if SSS_USE_TANGENT_PLANE
    // Both 'xy2' and 'z' require conversion to millimeters.
    float r = sqrt(xy2 + z * z) * mmPerUnit;
#else
    // Only 'z' requires conversion to millimeters.
    float r = sqrt(xy2 + (z * mmPerUnit) * (z * mmPerUnit));
#endif

#if SSS_CLAMP_ARTIFACT
    return saturate(KernelValCircle(r, S) * rcpPdf);
#else
    return KernelValCircle(r, S) * rcpPdf;
#endif
}

// Evaluates the sample 'i' of 'n' of the filter kernel of the profile 'profileID',
// and accumulates the weighted irradiance and the weight into the 'inout' parameters.
// 'iR' and 'iP' select the components of the kernel which store the radius and rcp(pdf).
// Samples which fail the stencil test (linear depth of 0) do not contribute.
void EvaluateSample(uint i, uint n, uint profileID, uint iR, uint iP, float2 centerCoord, int2 cacheAnchor,
                    float3 shapeParam, float3 centerPosVS, float mmPerUnit, float2 pixelsPerMm,
                    float3 tangentX, float3 tangentY, float4x4 projMatrix,
                    inout float3 totalIrradiance, inout float3 totalWeight)
{
    float r = _FilterKernels[profileID][i][iR];
    // The relative sample position is known at the compile time.
    float  phi = SampleDiskFibonacci(i, n).y;
    float2 vec = r * float2(cos(phi), sin(phi));

    // Compute the screen-space position and the squared distance (in mm) in the image plane.
    int2 position; float xy2;

#if SSS_USE_TANGENT_PLANE
    float3 relPosVS   = vec.x * tangentX + vec.y * tangentY;
    float3 positionVS = centerPosVS + relPosVS;
    float4 positionCS = mul(projMatrix, float4(positionVS, 1));
    float2 positionSS = ComputeScreenSpacePosition(positionCS);

    position = (int2)(positionSS * _ScreenSize.xy);
    xy2      = dot(relPosVS.xy, relPosVS.xy);
#else
    position = (int2)(centerCoord + vec * pixelsPerMm);
    xy2      = r * r;
#endif

    float4 textureSample = LoadSample(position, cacheAnchor);
    float3 irradiance    = textureSample.rgb;
    float  linearDepth   = textureSample.a;

    // Check the results of the stencil test.
    if (linearDepth > 0)
    {
        // Apply bilateral weighting.
        float  z = linearDepth - centerPosVS.z;
        float  p = _FilterKernels[profileID][i][iP];
        float3 w = ComputeBilateralWeight(xy2, z, mmPerUnit, shapeParam, p);

        totalIrradiance += w * irradiance;
        totalWeight     += w;
    }
}

#pragma kernel SubsurfaceScattering

// Filters the irradiance of the pixels tagged for split lighting with the diffusion profile.
// Each thread group covers a tile of GROUP_SIZE_1D x GROUP_SIZE_1D pixels, one thread per pixel.
[numthreads(GROUP_SIZE_2D, 1, 1)]
void SubsurfaceScattering(uint2 groupId       : SV_GroupID,
                          uint  groupThreadId : SV_GroupThreadID)
{
    // Note: any factor of 64 is a suitable wave size for our algorithm.
    uint waveIndex = groupThreadId / 64;
    uint laneIndex = groupThreadId % 64;
    uint quadIndex = laneIndex / 4;

    // Arrange threads in the Morton order to optimally match the memory layout of GCN tiles.
    uint  mortonCode  = groupThreadId;
    uint2 localCoord  = DecodeMorton2D(mortonCode);
    uint2 tileAnchor  = groupId * GROUP_SIZE_1D;
    uint2 pixelCoord  = tileAnchor + localCoord;
    int2  cacheAnchor = (int2)tileAnchor - TEXTURE_CACHE_BORDER;
    uint2 cacheCoord  = localCoord + TEXTURE_CACHE_BORDER;

    float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;

    [branch] if (groupThreadId == 0)
    {
        // Check whether the thread group needs to perform any work.
        // A group of 16x16 pixels covers 2x2 tiles of 8x8 pixels.
        float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 0)).r;
        float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 0)).r;
        float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 1)).r;
        float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 1)).r;

        // Perform the stencil test (reject at the tile rate).
        processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);
    }

    // Wait for the LDS.
    GroupMemoryBarrierWithGroupSync();

    // The value is the same for all threads of the group, so either all of them exit or none.
    [branch] if (!processGroup) { return; }

    float3 centerIrradiance  = 0;
    float  centerDepth       = 0;
    float4 cachedValue       = float4(0, 0, 0, 0);

    bool passedStencilTest = StencilTest((int2)pixelCoord, stencilRef);

    [branch] if (passedStencilTest)
    {
        centerIrradiance = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
        centerDepth      = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
        cachedValue      = float4(centerIrradiance, LinearEyeDepth(centerDepth, _ZBufferParams));
    }

#if SSS_USE_LDS_CACHE
    // Populate the central region of the LDS cache.
    textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)] = cachedValue;

    // The border is filled by quads of 2x2 texels. Each of the 4 waves handles
    // one L-shaped corner of the border (9 quads per wave for the cache of 20x20 texels).
    uint numBorderQuadsPerWave = TEXTURE_CACHE_SIZE_1D / 2 - 1;
    uint halfCacheWidthInQuads = TEXTURE_CACHE_SIZE_1D / 4;

    [branch] if (quadIndex < numBorderQuadsPerWave)
    {
        // Fetch another texel into the LDS.
        uint2 startQuad = halfCacheWidthInQuads * uint2(waveIndex & 1, waveIndex >> 1);

        uint2 quadCoord;

        // The traversal order is such that the quad's X coordinate is monotonically increasing.
        // Note: the compiler can heavily optimize the code below, as the switch is scalar,
        // and there are very few unique values due to the symmetry.
        switch (waveIndex)
        {
            case 0:  // Top left corner
                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                quadCoord.y = max(0, (int)((halfCacheWidthInQuads - 1) - quadIndex));
                break;
            case 1:  // Top right corner
                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
                quadCoord.y = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                break;
            case 2:  // Bottom left corner
                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                quadCoord.y = min(quadIndex, halfCacheWidthInQuads - 1);
                break;
            default: // 3, bottom right corner
                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
                quadCoord.y = min(halfCacheWidthInQuads - 1, 2 * (halfCacheWidthInQuads - 1) - quadIndex);
                break;
        }

        // The two lowest bits of the lane index select the texel within the quad.
        uint2 cacheCoord2  = 2 * (startQuad + quadCoord) + uint2(laneIndex & 1, (laneIndex >> 1) & 1);
        int2  pixelCoord2  = (int2)(tileAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
        float4 cachedValue2 = float4(0, 0, 0, 0);

        [branch] if (StencilTest(pixelCoord2, stencilRef))
        {
            cachedValue2 = LoadSampleFromVideoMemory(pixelCoord2);
        }

        // Populate the border region of the LDS cache.
        textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord2.y, cacheCoord2.x)] = cachedValue2;
    }

    // Wait for the LDS.
    GroupMemoryBarrierWithGroupSync();
#endif

    // Threads may only exit after the last barrier, since they participate in filling the cache.
    bool isOffScreen = pixelCoord.x >= (uint)_ScreenSize.x || pixelCoord.y >= (uint)_ScreenSize.y;

    [branch] if (!passedStencilTest || isOffScreen) { return; }

    PositionInputs posInput = GetPositionInput(pixelCoord, _ScreenSize.zw);

    float3 unused;

    // The result of the stencil test allows us to statically determine the material type (SSS).
    BSDFData bsdfData;
    FETCH_GBUFFER(gbuffer, _GBufferTexture, pixelCoord);
    DECODE_FROM_GBUFFER(gbuffer, MATERIALFEATUREFLAGS_LIT_SSS, bsdfData, unused);

    int    profileID   = bsdfData.subsurfaceProfile;
    float  distScale   = bsdfData.subsurfaceRadius;
    float3 shapeParam  = _ShapeParams[profileID].rgb;
    float  maxDistance = _ShapeParams[profileID].a;

    // Reconstruct the view-space position corresponding to the central sample.
    float2 centerPosSS = posInput.positionSS;
    float2 cornerPosSS = centerPosSS + 0.5 * _ScreenSize.zw;
    float3 centerPosVS = ComputeViewSpacePosition(centerPosSS, centerDepth, _InvProjMatrix);
    float3 cornerPosVS = ComputeViewSpacePosition(cornerPosSS, centerDepth, _InvProjMatrix);

    // Rescaling the filter is equivalent to inversely scaling the world.
    // N.b.: if 'distScale' is 0, the values below become infinite (or 0);
    // this case is handled by the early out which follows.
    float mmPerUnit  = MILLIMETERS_PER_METER * (_WorldScales[profileID].x / distScale);
    float unitsPerMm = rcp(mmPerUnit);

    // Compute the view-space dimensions of the pixel as a quad projected onto geometry.
    float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);
    float2 pixelsPerMm   = rcp(unitsPerPixel) * unitsPerMm;

    // We perform point sampling. Therefore, we can avoid the cost
    // of filtering if we stay within the bounds of the current pixel.
    // We use the value of 1 instead of 0.5 as an optimization.
    // N.b.: our LoD selection algorithm is the same regardless of
    // whether we integrate over the tangent plane or not, since we
    // don't want the orientation of the tangent plane to create
    // divergence of execution across the warp.
    float maxDistInPixels = maxDistance * max(pixelsPerMm.x, pixelsPerMm.y);

    [branch] if (distScale == 0 || maxDistInPixels < 1)
    {
    #if SSS_DEBUG_LOD
        _CameraFilteringTexture[pixelCoord] = float4(0, 0, 1, 1);
    #else
        _CameraFilteringTexture[pixelCoord] = float4(bsdfData.diffuseColor * centerIrradiance, 1);
    #endif
        return;
    }

    float4x4 viewMatrix, projMatrix;
    GetLeftHandedViewSpaceMatrices(viewMatrix, projMatrix);

    // Compute the tangent frame in view space.
    float3 normalVS = mul((float3x3)viewMatrix, bsdfData.normalWS);
    float3 tangentX = GetLocalFrame(normalVS)[0] * unitsPerMm;
    float3 tangentY = GetLocalFrame(normalVS)[1] * unitsPerMm;

#if SSS_DEBUG_NORMAL_VS
    // We expect the view-space normal to be front-facing.
    if (normalVS.z >= 0)
    {
        _CameraFilteringTexture[pixelCoord] = float4(1, 1, 1, 1);
        return;
    }
#endif

    // Use more samples for SS regions larger than 5x5 pixels (rotated by 45 degrees).
    bool useNearFieldKernel = SSS_ENABLE_NEAR_FIELD && maxDistInPixels > SSS_LOD_THRESHOLD;

#if SSS_DEBUG_LOD
    _CameraFilteringTexture[pixelCoord] = useNearFieldKernel ? float4(1, 0, 0, 1) : float4(0.5, 0.5, 0, 1);
    return;
#endif

    // Compute the indices used to access the individual components of the float4 of the kernel.
    uint iR = useNearFieldKernel ? 0 : 2; // radius
    uint iP = useNearFieldKernel ? 1 : 3; // rcp(pdf)

    // The central sample (index 0) always passes the stencil test and has no depth difference.
    float  centerRadius = _FilterKernels[profileID][0][iR];
    float  centerRcpPdf = _FilterKernels[profileID][0][iP];
    float3 centerWeight = KernelValCircle(centerRadius, shapeParam) * centerRcpPdf;

    // Accumulate filtered irradiance and bilateral weights (for renormalization).
    float3 totalIrradiance = centerWeight * centerIrradiance;
    float3 totalWeight     = centerWeight;

    int i, n; // Declare once to avoid the warning from the Unity shader compiler.

    [unroll]
    for (i = 1, n = SSS_N_SAMPLES_FAR_FIELD; i < n; i++)
    {
        // Integrate over the image or tangent plane in the view space.
        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheAnchor,
                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                       tangentX, tangentY, projMatrix,
                       totalIrradiance, totalWeight);
    }

    [branch] if (!useNearFieldKernel)
    {
        // Avoid the division by 0 for color channels which accumulate no weight.
        totalWeight = max(totalWeight, SSS_MIN_TOTAL_WEIGHT);

        _CameraFilteringTexture[pixelCoord] = float4(bsdfData.diffuseColor * totalIrradiance / totalWeight, 1);
        return;
    }

    [unroll]
    for (i = SSS_N_SAMPLES_FAR_FIELD, n = SSS_N_SAMPLES_NEAR_FIELD; i < n; i++)
    {
        // Integrate over the image or tangent plane in the view space.
        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheAnchor,
                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                       tangentX, tangentY, projMatrix,
                       totalIrradiance, totalWeight);
    }

    // Avoid the division by 0 for color channels which accumulate no weight.
    totalWeight = max(totalWeight, SSS_MIN_TOTAL_WEIGHT);

    _CameraFilteringTexture[pixelCoord] = float4(bsdfData.diffuseColor * totalIrradiance / totalWeight, 1);
}