您最多选择25个主题
主题必须以中文或者字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
458 行
18 KiB
458 行
18 KiB
// =============== Convolves transmitted radiance with the Disney diffusion profile ================
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Definitions
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
// #pragma enable_d3d11_debug_symbols
|
|
|
|
// Tweak parameters.
// When 0, the depth difference between samples is ignored by the bilateral weight.
#define SSS_BILATERAL_FILTER    1
// When 1, the tile and its border are staged in group-shared memory before filtering.
#define SSS_USE_LDS_CACHE       1
// Greatly increases the number of samples. Comes at a high cost.
#define SSS_ENABLE_NEAR_FIELD   0
// Potential optimization (per-sample HTile rejection). YMMV.
#define SSS_SAMPLE_TEST_HTILE   0
// Integrates over the tangent plane instead of the image plane, improving the accuracy
// of the approximation (0 -> 1st order). High cost. Does not work with back-facing normals.
#define SSS_USE_TANGENT_PLANE   0
// Reduces bleeding. Use with SSS_USE_TANGENT_PLANE.
#define SSS_CLAMP_ARTIFACT      0
// Debug view: outputs a flat color per LoD instead of the filtered result.
#define SSS_DEBUG_LOD           0
// Debug view: outputs white where the view-space normal is not front-facing.
#define SSS_DEBUG_NORMAL_VS     0

// Do not modify these.
#include "../../../ShaderPass/ShaderPass.cs.hlsl"
#define SHADERPASS              SHADERPASS_SUBSURFACE_SCATTERING
#define MILLIMETERS_PER_METER   1000
#define CENTIMETERS_PER_METER   100
// Thread group footprint: GROUP_SIZE_1D x GROUP_SIZE_1D pixels, one thread per pixel.
#define GROUP_SIZE_1D           16
#define GROUP_SIZE_2D           (GROUP_SIZE_1D * GROUP_SIZE_1D)
// The LDS cache extends the tile by TEXTURE_CACHE_BORDER texels on every side.
#define TEXTURE_CACHE_BORDER    2
#define TEXTURE_CACHE_SIZE_1D   (GROUP_SIZE_1D + 2 * TEXTURE_CACHE_BORDER)

// Check for support of typed UAV loads from FORMAT_R16G16B16A16_FLOAT.
// Platforms other than PSSL and Xbox One write to an intermediate buffer instead.
// TODO: query the format support more precisely.
#if !(defined(SHADER_API_PSSL) || defined(SHADER_API_XBOXONE))
#define USE_INTERMEDIATE_BUFFER
#endif
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Included headers
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
#include "../../../../Core/ShaderLibrary/Packing.hlsl"
|
|
#include "../../../../Core/ShaderLibrary/SpaceFillingCurves.hlsl"
|
|
#include "../../../ShaderVariables.hlsl"
|
|
#define UNITY_MATERIAL_LIT
|
|
#include "../../../Material/Material.hlsl"
|
|
#include "../../../Lighting/LightDefinition.cs.hlsl"
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Inputs & outputs
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
// Size of the world unit in meters, per profile (only the X component is used).
float4 _WorldScales[SSS_N_PROFILES];

// Per-profile sample table. XY = near field, ZW = far field.
// Within each pair: first component = radius, second = reciprocal of the PDF.
float4 _FilterKernels[SSS_N_PROFILES][SSS_N_SAMPLES_NEAR_FIELD];

// Contains the albedo and SSS parameters.
DECLARE_GBUFFER_TEXTURE(_GBufferTexture);

// Z-buffer.
TEXTURE2D(_DepthTexture);

// Copy of the stencil HTile (DXGI_FORMAT_R8_UINT is not supported by Unity).
TEXTURE2D(_HTile);

// Irradiance to be filtered. Includes transmitted light.
TEXTURE2D(_IrradianceSource);

// Target texture: an intermediate buffer when typed UAV loads are unavailable,
// otherwise the camera color buffer itself.
#ifdef USE_INTERMEDIATE_BUFFER
RW_TEXTURE2D(float4, _CameraFilteringTexture);
#else
RW_TEXTURE2D(float4, _CameraColorTexture);
#endif
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Implementation
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
// Group-shared storage.
// 6656 bytes used. It appears that the reserved LDS space must be a multiple of 512 bytes.
#if SSS_USE_LDS_CACHE
// One {irradiance, linearDepth} entry per texel of the tile plus its border.
groupshared float4 textureCache[TEXTURE_CACHE_SIZE_1D * TEXTURE_CACHE_SIZE_1D];
#endif

// Result of the tile-rate stencil test, written by thread 0 and read by the whole group.
groupshared bool processGroup;
|
|
|
|
// Determines whether the pixel at 'pixelCoord' should take part in the SSS filter.
//
// pixelCoord - integer pixel coordinate to test.
// stencilRef - stencil reference value; only consulted when SSS_SAMPLE_TEST_HTILE is enabled.
//
// Returns the result of TestLightingForSSS() on the pixel's irradiance, or false if the
// optional HTile pre-test rejects the pixel first.
bool StencilTest(int2 pixelCoord, float stencilRef)
{
    bool passed;

#if SSS_SAMPLE_TEST_HTILE
    // HTile is addressed at 1/8th of the pixel resolution.
    int2 htileCoord = pixelCoord / 8;

    // Reject at the tile rate.
    passed = (LOAD_TEXTURE2D(_HTile, htileCoord).r == stencilRef);

    [branch] if (passed)
#endif
    // Without the HTile pre-test the block below runs unconditionally: it is extremely
    // uncommon for individual samples to fail the HTile test, so the HiS test is skipped.
    {
        // Our copy of HTile does not allow accepting at the tile rate, so the test must
        // also be performed at the pixel rate. The tagged irradiance buffer is checked
        // instead of the stencil buffer to avoid an extra stencil texture fetch.
        float3 taggedIrradiance = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
        passed = TestLightingForSSS(taggedIrradiance);
    }

    return passed;
}
|
|
|
|
#if SSS_USE_LDS_CACHE
// Reads one {irradiance, linearDepth} entry from the group-shared cache.
// 'cacheCoord' is a 2D coordinate within the cache; rows are TEXTURE_CACHE_SIZE_1D wide.
// No bounds checking is performed here.
float4 LoadSampleFromCacheMemory(int2 cacheCoord)
{
    // Row-major linear index: y * width + x.
    int linearIndex = Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x);
    return textureCache[linearIndex];
}
#endif
|
|
|
|
// Fetches a sample directly from the source textures.
// Returns {irradiance, linearDepth}, where the depth read from the Z-buffer is converted
// with LinearEyeDepth().
float4 LoadSampleFromVideoMemory(int2 pixelCoord)
{
    float  deviceDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
    float3 rgb         = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
    float  linearDepth = LinearEyeDepth(deviceDepth, _ZBufferParams);

    return float4(rgb, linearDepth);
}
|
|
|
|
// Loads the sample at 'pixelCoord', preferring the LDS cache over video memory.
//
// pixelCoord  - integer pixel coordinate of the sample.
// cacheAnchor - pixel coordinate that maps to entry (0, 0) of the LDS cache.
//
// Returns {irradiance, linearDepth}, or all zeros if the sample is outside of the cache
// and fails the stencil test.
float4 LoadSample(int2 pixelCoord, int2 cacheAnchor)
{
#if SSS_USE_LDS_CACHE
    int2 cacheCoord = pixelCoord - cacheAnchor;

    // The unsigned casts turn negative coordinates into large values,
    // so a single comparison rejects both sides of the range.
    bool isInCache = max((uint)cacheCoord.x, (uint)cacheCoord.y) < TEXTURE_CACHE_SIZE_1D;

    [branch] if (isInCache)
    {
        return LoadSampleFromCacheMemory(cacheCoord);
    }
#endif

    // Cache miss (or cache disabled): validate the pixel, then go to video memory.
    float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;

    [branch] if (!StencilTest(pixelCoord, stencilRef))
    {
        return float4(0, 0, 0, 0);
    }

    return LoadSampleFromVideoMemory(pixelCoord);
}
|
|
|
|
// Evaluates the integrand over a disk: (2 * PI * r) * KernelVal().
//
// r - radius at which to evaluate the profile.
// S - per-channel shape parameter.
//
// N.b.: the returned value is multiplied by 4 (the 0.25 factor is omitted).
// This is irrelevant because the weights are renormalized by the caller.
float3 KernelValCircle(float r, float3 S)
{
    // e = exp(-S * r / 3), so that e^3 = exp(-S * r).
    float3 e  = exp(((-1.0 / 3.0) * r) * S);
    float3 e3 = e * e * e;

    return S * (e + e3);
}
|
|
|
|
// Computes the sample weight F(r) / P(r), s.t. r = sqrt(xy^2 + z^2).
//
// xy2       - squared distance within the integration plane.
// z         - depth difference between the sample and the center.
// mmPerUnit - conversion factor applied to bring distances to millimeters.
// S         - per-channel shape parameter of the profile.
// rcpPdf    - reciprocal of the PDF of the sample.
//
// Rescaling of the PDF is handled by 'totalWeight' in the caller.
float3 ComputeBilateralWeight(float xy2, float z, float mmPerUnit, float3 S, float rcpPdf)
{
#if !SSS_BILATERAL_FILTER
    // Bilateral filtering disabled: ignore the depth difference.
    z = 0;
#endif

#if SSS_USE_TANGENT_PLANE
    // Both 'xy2' and 'z' require conversion to millimeters.
    float r = sqrt(xy2 + z * z) * mmPerUnit;
#else
    // Only 'z' requires conversion to millimeters.
    float r = sqrt(xy2 + (z * mmPerUnit) * (z * mmPerUnit));
#endif

    float3 weight = KernelValCircle(r, S) * rcpPdf;

#if SSS_CLAMP_ARTIFACT
    weight = saturate(weight);
#endif

    return weight;
}
|
|
|
|
// Evaluates the i-th sample of the filter kernel and accumulates its contribution.
//
// i, n            - sample index and sample count passed to SampleDiskFibonacci().
// profileID       - index of the profile within _FilterKernels.
// iR, iP          - component indices of the radius and of rcp(pdf) within a kernel entry.
// centerCoord     - screen-space position (in pixels) of the center of the filter.
// cacheAnchor     - pixel coordinate that maps to entry (0, 0) of the LDS cache.
// shapeParam      - per-channel shape parameter of the profile.
// centerPosVS     - view-space position of the central sample.
// mmPerUnit       - millimeters per world unit.
// pixelsPerMm     - pixels per millimeter along each screen axis.
// tangentX/Y      - tangent frame axes; only used with SSS_USE_TANGENT_PLANE.
// projMatrix      - projection matrix; only used with SSS_USE_TANGENT_PLANE.
// totalIrradiance - running sum of weighted irradiance (updated in place).
// totalWeight     - running sum of weights (updated in place).
void EvaluateSample(uint i, uint n, uint profileID, uint iR, uint iP, float2 centerCoord, int2 cacheAnchor,
                    float3 shapeParam, float3 centerPosVS, float mmPerUnit, float2 pixelsPerMm,
                    float3 tangentX, float3 tangentY, float4x4 projMatrix,
                    inout float3 totalIrradiance, inout float3 totalWeight)
{
    float radius = _FilterKernels[profileID][i][iR];

    // The relative sample position is known at the compile time.
    float  angle  = SampleDiskFibonacci(i, n).y;
    float2 offset = radius * float2(cos(angle), sin(angle));

    // Compute the screen-space position and the squared distance (in mm) in the image plane.
    int2  samplePixel;
    float planarDistSq;

#if SSS_USE_TANGENT_PLANE
    float3 relPosVS   = offset.x * tangentX + offset.y * tangentY;
    float3 positionVS = centerPosVS + relPosVS;
    float4 positionCS = mul(projMatrix, float4(positionVS, 1));
    float2 positionSS = ComputeScreenSpacePosition(positionCS);

    samplePixel  = (int2)(positionSS * _ScreenSize.xy);
    planarDistSq = dot(relPosVS.xy, relPosVS.xy);
#else
    samplePixel  = (int2)(centerCoord + offset * pixelsPerMm);
    planarDistSq = radius * radius;
#endif

    float4 sampleValue      = LoadSample(samplePixel, cacheAnchor);
    float3 sampleIrradiance = sampleValue.rgb;

    // Check the results of the stencil test.
    if (TestLightingForSSS(sampleIrradiance))
    {
        // Apply bilateral weighting.
        float  depthDelta = sampleValue.a - centerPosVS.z;
        float  rcpPdf     = _FilterKernels[profileID][i][iP];
        float3 weight     = ComputeBilateralWeight(planarDistSq, depthDelta, mmPerUnit, shapeParam, rcpPdf);

        totalIrradiance += weight * sampleIrradiance;
        totalWeight     += weight;
    }

    // Otherwise the irradiance is 0, which could happen for 2 reasons.
    // Most likely, the surface fragment does not have an SSS material.
    // Alternatively, the sample comes from a region without any geometry.
    // The blur is energy-preserving, so such a sample contributes neither irradiance nor weight.
    // The caller's loop is not terminated, since the contribution of the remaining
    // samples must still be gathered (e.g. in case of hair covering skin).
}
|
|
|
|
// Stores the filtered irradiance of a pixel in the target texture.
// With the intermediate buffer, the value overwrites the texel (alpha = 1).
// Otherwise, it is added to the RGB of the camera color buffer (alpha left unchanged).
void WriteResult(uint2 pixelCoord, float3 irradiance)
{
#ifndef USE_INTERMEDIATE_BUFFER
    _CameraColorTexture[pixelCoord] += float4(irradiance, 0);
#else
    _CameraFilteringTexture[pixelCoord] = float4(irradiance, 1);
#endif
}
|
|
|
|
#pragma kernel SubsurfaceScattering
|
|
|
|
// Compute kernel: filters the irradiance of one GROUP_SIZE_1D x GROUP_SIZE_1D pixel tile.
//
// groupId       - 2D index of the thread group (i.e. of the tile).
// groupThreadId - flat index of the thread within the group, interpreted as a Morton code
//                 to obtain the 2D position of the thread's pixel within the tile.
//
// Outline:
//   1. Thread 0 tests the four HTile texels of the tile; the whole group exits if none match.
//   2. Each thread runs the per-pixel stencil test and loads its own {irradiance, depth}.
//   3. (SSS_USE_LDS_CACHE) The tile and its border are written to the LDS cache.
//   4. Pixels that passed the test accumulate weighted samples and write the normalized result.
[numthreads(GROUP_SIZE_2D, 1, 1)]
void SubsurfaceScattering(uint2 groupId       : SV_GroupID,
                          uint  groupThreadId : SV_GroupThreadID)
{
    // Note: any factor of 64 is a suitable wave size for our algorithm.
    uint waveIndex = WaveReadFirstLane(groupThreadId / 64);
    uint laneIndex = groupThreadId % 64;
    uint quadIndex = laneIndex / 4;

    // Arrange threads in the Morton order to optimally match the memory layout of GCN tiles.
    uint  mortonCode  = groupThreadId;
    uint2 localCoord  = DecodeMorton2D(mortonCode);
    uint2 tileAnchor  = groupId * GROUP_SIZE_1D;
    uint2 pixelCoord  = tileAnchor + localCoord;
    int2  cacheAnchor = (int2)tileAnchor - TEXTURE_CACHE_BORDER;
    uint2 cacheCoord  = localCoord + TEXTURE_CACHE_BORDER;
    float stencilRef  = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;

    [branch] if (groupThreadId == 0)
    {
        // Check whether the thread group needs to perform any work.
        // The tile is covered by a 2x2 block of HTile texels.
        float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 0)).r;
        float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 0)).r;
        float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 1)).r;
        float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 1)).r;

        // Perform the stencil test (reject at the tile rate).
        processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);
    }

    // Wait for the LDS.
    GroupMemoryBarrierWithGroupSync();

    // 'processGroup' is shared, so either every thread of the group exits here or none does.
    [branch] if (!processGroup) { return; }

    float3 centerIrradiance = 0;
    float  centerDepth      = 0;
    float4 cachedValue      = 0;

    bool passedStencilTest = StencilTest((int2)pixelCoord, stencilRef);

    [branch] if (passedStencilTest)
    {
        centerIrradiance = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
        centerDepth      = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
        cachedValue      = float4(centerIrradiance, LinearEyeDepth(centerDepth, _ZBufferParams));
    }

#if SSS_USE_LDS_CACHE
    // Populate the central region of the LDS cache.
    // Pixels that failed the stencil test store zeros.
    textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)] = cachedValue;

    uint numBorderQuadsPerWave = TEXTURE_CACHE_SIZE_1D / 2 - 1;
    uint halfCacheWidthInQuads = TEXTURE_CACHE_SIZE_1D / 4;

    [branch] if (quadIndex < numBorderQuadsPerWave)
    {
        // Fetch another texel into the LDS.
        uint2 startQuad = halfCacheWidthInQuads * uint2(waveIndex & 1, waveIndex >> 1);

        uint2 quadCoord;

        // The traversal order is such that the quad's X coordinate is monotonically increasing.
        // Note: the compiler can heavily optimize the code below, as the switch is scalar,
        // and there are very few unique values due to the symmetry.
        switch (waveIndex)
        {
            case 0:
                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                quadCoord.y = max(0, (int)((halfCacheWidthInQuads - 1) - quadIndex));
                break;
            case 1:
                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
                quadCoord.y = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                break;
            case 2:
                quadCoord.x = max(0, (int)(quadIndex - (halfCacheWidthInQuads - 1)));
                quadCoord.y = min(quadIndex, halfCacheWidthInQuads - 1);
                break;
            default: // 3
                quadCoord.x = min(quadIndex, halfCacheWidthInQuads - 1);
                quadCoord.y = min(halfCacheWidthInQuads - 1, 2 * (halfCacheWidthInQuads - 1) - quadIndex);
                break;
        }

        // Each lane of a quad handles one texel of a 2x2 block.
        uint2  cacheCoord2  = 2 * (startQuad + quadCoord) + uint2(laneIndex & 1, (laneIndex >> 1) & 1);
        int2   pixelCoord2  = (int2)(tileAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
        float4 cachedValue2 = 0;

        [branch] if (StencilTest(pixelCoord2, stencilRef))
        {
            cachedValue2 = LoadSampleFromVideoMemory(pixelCoord2);
        }

        // Populate the border region of the LDS cache.
        textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord2.y, cacheCoord2.x)] = cachedValue2;
    }

    // Wait for the LDS.
    GroupMemoryBarrierWithGroupSync();
#endif

    bool isOffScreen = pixelCoord.x >= (uint)_ScreenSize.x || pixelCoord.y >= (uint)_ScreenSize.y;

    // From here on there are no more group-wide barriers, so individual threads may exit.
    [branch] if (!passedStencilTest || isOffScreen) { return; }

    PositionInputs posInput = GetPositionInput(pixelCoord, _ScreenSize.zw);

    float3 unused;

    // The result of the stencil test allows us to statically determine the material type (SSS).
    BSDFData bsdfData;
    FETCH_GBUFFER(gbuffer, _GBufferTexture, pixelCoord);
    DECODE_FROM_GBUFFER(gbuffer, MATERIALFEATUREFLAGS_LIT_SSS, bsdfData, unused);

    int    profileID   = bsdfData.subsurfaceProfile;
    float  distScale   = bsdfData.subsurfaceRadius;
    float3 shapeParam  = _ShapeParams[profileID].rgb;
    float  maxDistance = _ShapeParams[profileID].a;

    // Reconstruct the view-space position corresponding to the central sample.
    float2 centerPosSS = posInput.positionSS;
    float2 cornerPosSS = centerPosSS + 0.5 * _ScreenSize.zw;
    float3 centerPosVS = ComputeViewSpacePosition(centerPosSS, centerDepth, _InvProjMatrix);
    float3 cornerPosVS = ComputeViewSpacePosition(cornerPosSS, centerDepth, _InvProjMatrix);

    // Rescaling the filter is equivalent to inversely scaling the world.
    float mmPerUnit  = MILLIMETERS_PER_METER * (_WorldScales[profileID].x / distScale);
    float unitsPerMm = rcp(mmPerUnit);

    // Compute the view-space dimensions of the pixel as a quad projected onto geometry.
    float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);
    float2 pixelsPerMm   = rcp(unitsPerPixel) * unitsPerMm;

    // We perform point sampling. Therefore, we can avoid the cost
    // of filtering if we stay within the bounds of the current pixel.
    // We use the value of 1 instead of 0.5 as an optimization.
    // N.b.: our LoD selection algorithm is the same regardless of
    // whether we integrate over the tangent plane or not, since we
    // don't want the orientation of the tangent plane to create
    // divergence of execution across the warp.
    float maxDistInPixels = maxDistance * max(pixelsPerMm.x, pixelsPerMm.y);

    float3 albedo = ApplyDiffuseTexturingMode(bsdfData);

    [branch] if (distScale == 0 || maxDistInPixels < 1)
    {
        // The filter footprint is at most one pixel: pass the center sample through.
    #if SSS_DEBUG_LOD
        WriteResult(pixelCoord, float3(0, 0, 1));
    #else
        WriteResult(pixelCoord, albedo * centerIrradiance);
    #endif
        return;
    }

    float4x4 viewMatrix, projMatrix;
    GetLeftHandedViewSpaceMatrices(viewMatrix, projMatrix);

    // Compute the tangent frame in view space.
    float3 normalVS = mul((float3x3)viewMatrix, bsdfData.normalWS);
    float3 tangentX = GetLocalFrame(normalVS)[0] * unitsPerMm;
    float3 tangentY = GetLocalFrame(normalVS)[1] * unitsPerMm;

#if SSS_DEBUG_NORMAL_VS
    // We expect the normal to be front-facing.
    float3 viewDirVS = normalize(centerPosVS);
    if (dot(normalVS, viewDirVS) >= 0)
    {
        WriteResult(pixelCoord, float3(1, 1, 1));
        return;
    }
#endif

    // Use more samples for SS regions larger than 5x5 pixels (rotated by 45 degrees).
    bool useNearFieldKernel = SSS_ENABLE_NEAR_FIELD && maxDistInPixels > SSS_LOD_THRESHOLD;

#if SSS_DEBUG_LOD
    // Fixed: the closing parenthesis of the call was missing, which broke
    // compilation whenever SSS_DEBUG_LOD was enabled.
    WriteResult(pixelCoord, useNearFieldKernel ? float3(1, 0, 0) : float3(0.5, 0.5, 0));
    return;
#endif

    // Compute the indices used to access the individual components of the float4 of the kernel.
    uint iR = useNearFieldKernel ? 0 : 2; // radius
    uint iP = useNearFieldKernel ? 1 : 3; // rcp(pdf)

    // Sample 0 of the kernel is the center sample; it is weighted without the bilateral term.
    float  centerRadius = _FilterKernels[profileID][0][iR];
    float  centerRcpPdf = _FilterKernels[profileID][0][iP];
    float3 centerWeight = KernelValCircle(centerRadius, shapeParam) * centerRcpPdf;

    // Accumulate filtered irradiance and bilateral weights (for renormalization).
    float3 totalIrradiance = centerWeight * centerIrradiance;
    float3 totalWeight     = centerWeight;

    int i, n; // Declare once to avoid the warning from the Unity shader compiler.

    [unroll]
    for (i = 1, n = SSS_N_SAMPLES_FAR_FIELD; i < n; i++)
    {
        // Integrate over the image or tangent plane in the view space.
        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheAnchor,
                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                       tangentX, tangentY, projMatrix,
                       totalIrradiance, totalWeight);
    }

    [branch] if (!useNearFieldKernel)
    {
        WriteResult(pixelCoord, albedo * totalIrradiance / totalWeight);
        return;
    }

    // Near field only: evaluate the remaining samples of the larger kernel.
    [unroll]
    for (i = SSS_N_SAMPLES_FAR_FIELD, n = SSS_N_SAMPLES_NEAR_FIELD; i < n; i++)
    {
        // Integrate over the image or tangent plane in the view space.
        EvaluateSample(i, n, profileID, iR, iP, pixelCoord + 0.5, cacheAnchor,
                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                       tangentX, tangentY, projMatrix,
                       totalIrradiance, totalWeight);
    }

    WriteResult(pixelCoord, albedo * totalIrradiance / totalWeight);
}
|