// NOTE: repository web-viewer residue (topic-picker hint; file stats: 442 lines, 18 KiB) - not part of the shader source.
// =============== Convolves transmitted radiance with the Disney diffusion profile ================
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Definitions
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
// #pragma enable_d3d11_debug_symbols
|
|
|
|
// Tweak parameters (1 = on, 0 = off).
#define SSS_BILATERAL_FILTER  1 // When 0, the depth difference is ignored by the bilateral weight.
#define SSS_USE_LDS_CACHE     1 // Stage {irradiance, linearDepth} texels in group-shared memory.
#define SSS_ENABLE_NEAR_FIELD 0 // Greatly increases the number of samples. Comes at a high cost.
#define SSS_SAMPLE_TEST_HTILE 0 // Potential optimization. YMMV.
#define SSS_USE_TANGENT_PLANE 0 // Improves the accuracy of the approximation (0 -> 1st order). High cost. Does not work with back-facing normals.
#define SSS_CLAMP_ARTIFACT    0 // Reduces bleeding. Use with SSS_USE_TANGENT_PLANE.
#define SSS_DEBUG_LOD         0 // Writes a flat color per kernel LOD instead of the filtered result.
#define SSS_DEBUG_NORMAL_VS   0 // Writes white for pixels whose view-space normal is not front-facing.

// Do not modify these.
#include "../../../ShaderPass/ShaderPass.cs.hlsl"
#define SHADERPASS            SHADERPASS_SUBSURFACE_SCATTERING
#define GROUP_SIZE_1D         16                                         // Tile edge, in pixels.
#define GROUP_SIZE_2D         (GROUP_SIZE_1D * GROUP_SIZE_1D)            // Threads per group.
#define TEXTURE_CACHE_BORDER  2                                          // Apron around the tile, in texels.
#define TEXTURE_CACHE_SIZE_1D (GROUP_SIZE_1D + 2 * TEXTURE_CACHE_BORDER) // Cache edge, in texels.

// Check for support of typed UAV loads from FORMAT_R16G16B16A16_FLOAT.
// TODO: query the format support more precisely.
#if !defined(SHADER_API_PSSL) && !defined(SHADER_API_XBOXONE)
    #define USE_INTERMEDIATE_BUFFER
#endif
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Included headers
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
// Shared shader library, pipeline variables and SSS helpers.
// All paths use forward slashes: a backslash separator is not portable across
// the hosts the shader compiler runs on.
#include "ShaderLibrary/Common.hlsl"
#include "ShaderLibrary/Packing.hlsl"
#include "ShaderLibrary/Fibonacci.hlsl"
#include "ShaderLibrary/SpaceFillingCurves.hlsl"
#include "../../../ShaderVariables.hlsl"
#include "../../../Lighting/LightDefinition.cs.hlsl"
#include "../../SubsurfaceScattering/SubsurfaceScattering.hlsl"
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Inputs & outputs
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
// One kernel per diffusion profile, one float4 per sample.
// XY = near field, ZW = far field; within each pair: first = radius, second = reciprocal of the PDF.
float4 _FilterKernels[SSS_N_PROFILES][SSS_N_SAMPLES_NEAR_FIELD];

TEXTURE2D(_DepthTexture);     // Z-buffer
TEXTURE2D(_HTile);            // DXGI_FORMAT_R8_UINT is not supported by Unity
TEXTURE2D(_IrradianceSource); // Includes transmitted light

// Target texture: the intermediate filtering buffer if one is required, else the camera color buffer.
#if defined(USE_INTERMEDIATE_BUFFER)
    RW_TEXTURE2D(float4, _CameraFilteringTexture);
#else
    RW_TEXTURE2D(float4, _CameraColorTexture);
#endif
|
|
|
|
//--------------------------------------------------------------------------------------------------
|
|
// Implementation
|
|
//--------------------------------------------------------------------------------------------------
|
|
|
|
// Group-shared (LDS) storage.
// 6656 bytes used. It appears that the reserved LDS space must be a multiple of 512 bytes.
#if SSS_USE_LDS_CACHE
    // One {irradiance, linearDepth} texel per cache cell, flattened row by row.
    groupshared float4 textureCache[TEXTURE_CACHE_SIZE_1D * TEXTURE_CACHE_SIZE_1D];
#endif
// Result of the tile-rate stencil test; written by thread 0 of the group, read by every thread.
groupshared bool processGroup;
|
|
|
|
#if SSS_USE_LDS_CACHE
|
|
// Reads the {irradiance, linearDepth} texel stored at 'cacheCoord' in the LDS cache.
// The caller must ensure that 'cacheCoord' lies inside the cache.
float4 LoadSampleFromCacheMemory(int2 cacheCoord)
{
    // Flatten the 2D coordinate (presumably Mad24(a, b, c) = a * b + c, i.e. row * width + column).
    int cellIndex = Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x);

    return textureCache[cellIndex];
}
|
|
#endif
|
|
|
|
// Reads irradiance and depth straight from the source textures.
// Returns {irradiance, linearDepth}, the same layout as the LDS cache.
float4 LoadSampleFromVideoMemory(int2 pixelCoord)
{
    float deviceDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;

    float4 texel;
    texel.rgb = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
    texel.a   = LinearEyeDepth(deviceDepth, _ZBufferParams);

    return texel;
}
|
|
|
|
// Returns {irradiance, linearDepth} for the given pixel.
// 'cacheOffset' is the pixel coordinate of the first cell of the LDS cache.
// Pixels covered by the cache are served from LDS; all others are read from video memory.
float4 LoadSample(int2 pixelCoord, int2 cacheOffset)
{
#if SSS_USE_LDS_CACHE
    int2 cacheCoord = pixelCoord - cacheOffset;

    // The unsigned casts turn negative coordinates into huge values,
    // so a single comparison per axis covers both bounds.
    bool insideX = (uint)cacheCoord.x < TEXTURE_CACHE_SIZE_1D;
    bool insideY = (uint)cacheCoord.y < TEXTURE_CACHE_SIZE_1D;

    [branch] if (insideX && insideY)
    {
        return LoadSampleFromCacheMemory(cacheCoord);
    }
#endif

    // Always load both irradiance and depth.
    // Avoid dependent texture reads at the cost of extra bandwidth.
    return LoadSampleFromVideoMemory(pixelCoord);
}
|
|
|
|
// Computes the value of the integrand in polar coordinates: f(r, s) = r * R(r, s).
// f(r, s) = (Exp[-r * s] + Exp[-r * s / 3]) * (s / (8 * Pi))
// We can drop the constant (s / (8 * Pi)) due to the subsequent weight renormalization.
float3 DisneyProfilePolar(float r, float3 S)
{
    // Reference form: exp(((-1.0 / 3.0) * r) * S).
    // Help the compiler: fold the constants and use exp2() instead.
    float  k = (-1.0 / 3.0) * LOG2_E;
    float3 e = exp2((k * r) * S);      // Exp[-r * s / 3]
    float3 eCubed = e * e * e;         // Exp[-r * s]

    return e + eCubed;
}
|
|
|
|
// Computes f(r, s)/p(r, s), s.t. r = sqrt(xy^2 + z^2).
// Rescaling of the PDF is handled by 'totalWeight'.
//   xy2       - squared distance within the sampling plane.
//   z         - depth difference between the sample and the center.
//   mmPerUnit - converts scene units to millimeters.
//   S         - per-channel shape parameter of the profile.
//   rcpPdf    - reciprocal of the PDF of the sample.
float3 ComputeBilateralWeight(float xy2, float z, float mmPerUnit, float3 S, float rcpPdf)
{
#if !SSS_BILATERAL_FILTER
    // Bilateral filtering is off: ignore the depth difference.
    z = 0;
#endif

#if SSS_USE_TANGENT_PLANE
    // Both 'xy2' and 'z' require conversion to millimeters.
    float r = sqrt(xy2 + z * z) * mmPerUnit;
#else
    // Only 'z' requires conversion to millimeters.
    float zMm = z * mmPerUnit;
    float r   = sqrt(xy2 + zMm * zMm);
#endif

    float3 weight = DisneyProfilePolar(r, S) * rcpPdf;

#if SSS_CLAMP_ARTIFACT
    weight = saturate(weight);
#endif

    return weight;
}
|
|
|
|
// Evaluates one sample of the filter kernel and accumulates its weighted contribution.
//   i, n            - sample index and sample count (select the angle on the Fibonacci disk).
//   profileID       - diffusion profile; selects the kernel in '_FilterKernels'.
//   iR, iP          - components of the kernel entry that hold the radius and rcp(pdf).
//   centerCoord     - position of the center sample, in pixels.
//   cacheOffset     - pixel coordinate of the first cell of the LDS cache (see LoadSample).
//   shapeParam      - per-channel shape parameter of the profile.
//   centerPosVS     - view-space position of the center sample.
//   mmPerUnit       - converts scene units to millimeters.
//   pixelsPerMm     - converts the kernel offset (in mm) to pixels; image-plane path only.
//   tangentX/Y      - tangent frame scaled to units per mm; SSS_USE_TANGENT_PLANE path only.
//   projMatrix      - projection matrix; SSS_USE_TANGENT_PLANE path only.
//   totalIrradiance - running sum of weight * irradiance (updated in place).
//   totalWeight     - running sum of weights, used for renormalization (updated in place).
void EvaluateSample(uint i, uint n, uint profileID, uint iR, uint iP, float2 centerCoord, int2 cacheOffset,
                    float3 shapeParam, float3 centerPosVS, float mmPerUnit, float2 pixelsPerMm,
                    float3 tangentX, float3 tangentY, float4x4 projMatrix,
                    inout float3 totalIrradiance, inout float3 totalWeight)
{
    float  r   = _FilterKernels[profileID][i][iR];
    // The relative sample position is known at the compile time.
    float  phi = SampleDiskFibonacci(i, n).y;
    float2 vec = r * float2(cos(phi), sin(phi));

    // Compute the screen-space position and the squared distance (in mm) in the image plane.
    int2 position; float xy2;

#if SSS_USE_TANGENT_PLANE
    float3 relPosVS    = vec.x * tangentX + vec.y * tangentY;
    float3 positionVS  = centerPosVS + relPosVS;
    // Project the view-space sample position to normalized device coordinates.
    float2 positionNDC = ComputeNormalizedDeviceCoordinates(positionVS, projMatrix);

    position = (int2)(positionNDC * _ScreenSize.xy);
    xy2      = dot(relPosVS.xy, relPosVS.xy);
#else
    position = (int2)(centerCoord + vec * pixelsPerMm);
    xy2      = r * r;
#endif

    float4 textureSample = LoadSample(position, cacheOffset);
    float3 irradiance    = textureSample.rgb;

    // Check the results of the stencil test.
    if (TestLightingForSSS(irradiance))
    {
        // Apply bilateral weighting.
        float  viewZ  = textureSample.a;
        float  relZ   = viewZ - centerPosVS.z;
        float  rcpPdf = _FilterKernels[profileID][i][iP];
        float3 weight = ComputeBilateralWeight(xy2, relZ, mmPerUnit, shapeParam, rcpPdf);

        // Note: if the texture sample is off-screen, (z = 0) -> (viewZ = far) -> (weight ≈ 0).
        totalIrradiance += weight * irradiance;
        totalWeight     += weight;
    }
    else
    {
        // The irradiance is 0. This could happen for 2 reasons.
        // Most likely, the surface fragment does not have an SSS material.
        // Alternatively, our sample comes from a region without any geometry.
        // Our blur is energy-preserving, so 'centerWeight' should be set to 0.
        // We do not terminate the loop since we want to gather the contribution
        // of the remaining samples (e.g. in case of hair covering skin).
    }
}
|
|
|
|
// Writes the filtered irradiance of a pixel to the target texture.
// With an intermediate buffer the value replaces the texel (alpha = 1);
// otherwise it is added to the RGB of the camera color buffer (alpha untouched).
void StoreResult(uint2 pixelCoord, float3 irradiance)
{
#if defined(USE_INTERMEDIATE_BUFFER)
    _CameraFilteringTexture[pixelCoord] = float4(irradiance, 1);
#else
    float4 previousColor = _CameraColorTexture[pixelCoord];
    _CameraColorTexture[pixelCoord] = previousColor + float4(irradiance, 0);
#endif
}
|
|
|
|
#pragma kernel SubsurfaceScattering
|
|
|
|
// Kernel entry point. Every thread filters the irradiance of one pixel; a thread group
// covers a GROUP_SIZE_1D x GROUP_SIZE_1D tile and shares one LDS cache of
// {irradiance, linearDepth} texels that extends TEXTURE_CACHE_BORDER texels past the tile.
[numthreads(GROUP_SIZE_2D, 1, 1)]
void SubsurfaceScattering(uint3 reorderedGroupId : SV_GroupID,
                          uint  groupThreadId    : SV_GroupThreadID)
{
    // Note: any factor of 64 is a suitable wave size for our algorithm.
    uint laneIndex = groupThreadId % 64;
    uint quadIndex = laneIndex / 4;
    uint waveIndex = WaveReadFirstLane(groupThreadId / 64);

    // We dispatch 4x swizzled 16x16 groups per a 32x32 macrotile.
    // Therefore, we need to reorder. TODO: macrotile order.
    uint2 groupQuad = DeinterleaveQuad(reorderedGroupId.x);
    uint2 groupId   = reorderedGroupId.yz * 2 + groupQuad;

    // Arrange threads in the Morton order to optimally match the memory layout of GCN tiles.
    uint2 groupCoord  = DecodeMorton2D(groupThreadId);
    uint2 groupOffset = groupId * GROUP_SIZE_1D;
    uint2 pixelCoord  = groupOffset + groupCoord;
    int2  cacheOffset = (int2)groupOffset - TEXTURE_CACHE_BORDER;

    // A single thread decides whether the whole group has any work to do.
    [branch] if (groupThreadId == 0)
    {
        float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;
        uint2 htileBase  = 2 * groupId; // The group is tested against a 2x2 block of HTile texels.

        float s00 = LOAD_TEXTURE2D(_HTile, htileBase + uint2(0, 0)).r;
        float s10 = LOAD_TEXTURE2D(_HTile, htileBase + uint2(1, 0)).r;
        float s01 = LOAD_TEXTURE2D(_HTile, htileBase + uint2(0, 1)).r;
        float s11 = LOAD_TEXTURE2D(_HTile, htileBase + uint2(1, 1)).r;

        // Perform the stencil test (reject at the tile rate).
        processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);
    }

    // Make 'processGroup' visible to the entire group.
    GroupMemoryBarrierWithGroupSync();

    [branch] if (!processGroup) { return; }

    float3 centerIrradiance  = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord).rgb;
    bool   passedStencilTest = TestLightingForSSS(centerIrradiance);
    float  centerDepth       = 0;
    float  centerViewZ       = 0;

    // Save some bandwidth by only loading depth values for SSS pixels.
    [branch] if (passedStencilTest)
    {
        centerDepth = LOAD_TEXTURE2D(_DepthTexture, pixelCoord).r;
        centerViewZ = LinearEyeDepth(centerDepth, _ZBufferParams);
    }

#if SSS_USE_LDS_CACHE
    // Populate the central region of the LDS cache: each thread stores its own pixel.
    uint2 cacheCoord = groupCoord + TEXTURE_CACHE_BORDER;
    textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord.y, cacheCoord.x)] = float4(centerIrradiance, centerViewZ);

    uint numBorderQuadsPerWave = TEXTURE_CACHE_SIZE_1D / 2 - 1;
    uint halfCacheWidthInQuads = TEXTURE_CACHE_SIZE_1D / 4;

    // The first 'numBorderQuadsPerWave' quads of each wave fetch one extra (border) texel per thread.
    [branch] if (quadIndex < numBorderQuadsPerWave)
    {
        // Each wave handles the L-shaped border around its own quarter of the cache.
        uint2 startQuad = halfCacheWidthInQuads * DeinterleaveQuad(waveIndex);
        uint  lastQuad  = halfCacheWidthInQuads - 1;

        // Stays at 0 for the first 'lastQuad' quads, then counts up by one per quad.
        uint rampUp  = max(0, (int)(quadIndex - lastQuad));
        // Counts up by one per quad, then stays at 'lastQuad'.
        uint plateau = min(quadIndex, lastQuad);

        uint2 quadCoord;

        // The traversal order is such that the quad's X coordinate is monotonically increasing.
        // The corner is always near the block of the corresponding wavefront.
        // Note: the compiler can heavily optimize the code below, as the switch is scalar,
        // and there are very few unique values due to the symmetry.
        switch (waveIndex)
        {
            case 0:  // Bottom left
                quadCoord.x = rampUp;
                quadCoord.y = max(0, (int)(lastQuad - quadIndex));
                break;
            case 1:  // Bottom right
                quadCoord.x = plateau;
                quadCoord.y = rampUp;
                break;
            case 2:  // Top left
                quadCoord.x = rampUp;
                quadCoord.y = plateau;
                break;
            default: // Top right
                quadCoord.x = plateau;
                quadCoord.y = min(lastQuad, 2 * lastQuad - quadIndex);
                break;
        }

        // Position of this thread's border texel, in cache cells and in pixels.
        uint2 cacheCoord2 = 2 * (startQuad + quadCoord) + DeinterleaveQuad(laneIndex);
        int2  pixelCoord2 = cacheOffset + (int2)cacheCoord2;

        float3 irradiance2 = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord2).rgb;
        float  viewZ2      = 0;

        // Save some bandwidth by only loading depth values for SSS pixels.
        [branch] if (TestLightingForSSS(irradiance2))
        {
            viewZ2 = LinearEyeDepth(LOAD_TEXTURE2D(_DepthTexture, pixelCoord2).r, _ZBufferParams);
        }

        // Populate the border region of the LDS cache.
        textureCache[Mad24(TEXTURE_CACHE_SIZE_1D, cacheCoord2.y, cacheCoord2.x)] = float4(irradiance2, viewZ2);
    }

    // Wait until the cache is fully populated.
    GroupMemoryBarrierWithGroupSync();
#endif

    // Non-SSS pixels only helped to fill the cache; they produce no output.
    [branch] if (!passedStencilTest) { return; }

    PositionInputs posInput = GetPositionInput(pixelCoord, _ScreenSize.zw);

    // The result of the stencil test allows us to statically determine the material type (SSS).
    SSSData sssData;
    DECODE_FROM_SSSBUFFER(posInput.positionSS, sssData);

    int    profileID   = sssData.subsurfaceProfile;
    float  distScale   = sssData.subsurfaceRadius;
    float4 shapeData   = _ShapeParams[profileID];
    float3 shapeParam  = shapeData.rgb;
    float  maxDistance = shapeData.a;

    // Reconstruct the view-space position corresponding to the central sample,
    // and the one of the pixel corner located half a pixel away at the same depth.
    float2 centerPosSS = posInput.positionNDC;
    float2 cornerPosSS = centerPosSS + 0.5 * _ScreenSize.zw;
    float3 centerPosVS = ComputeViewSpacePosition(centerPosSS, centerDepth, UNITY_MATRIX_I_P);
    float3 cornerPosVS = ComputeViewSpacePosition(cornerPosSS, centerDepth, UNITY_MATRIX_I_P);

    // Rescaling the filter is equivalent to inversely scaling the world.
    float mmPerUnit  = MILLIMETERS_PER_METER * (_WorldScales[profileID].x / distScale);
    float unitsPerMm = rcp(mmPerUnit);

    // Compute the view-space dimensions of the pixel as a quad projected onto geometry.
    float2 unitsPerPixel = 2 * abs(cornerPosVS.xy - centerPosVS.xy);
    float2 pixelsPerMm   = rcp(unitsPerPixel) * unitsPerMm;

    // We perform point sampling. Therefore, we can avoid the cost
    // of filtering if we stay within the bounds of the current pixel.
    // We use the value of 1 instead of 0.5 as an optimization.
    // N.b.: our LoD selection algorithm is the same regardless of
    // whether we integrate over the tangent plane or not, since we
    // don't want the orientation of the tangent plane to create
    // divergence of execution across the warp.
    float maxDistInPixels = maxDistance * max(pixelsPerMm.x, pixelsPerMm.y);

    float3 albedo = ApplyDiffuseTexturingMode(sssData.diffuseColor, profileID);

    // Nothing to blur: pass the center sample through.
    [branch] if (distScale == 0 || maxDistInPixels < 1)
    {
    #if SSS_DEBUG_LOD
        StoreResult(pixelCoord, float3(0, 0, 1));
    #else
        StoreResult(pixelCoord, albedo * centerIrradiance);
    #endif
        return;
    }

    float4x4 viewMatrix, projMatrix;
    GetLeftHandedViewSpaceMatrices(viewMatrix, projMatrix);

    // TODO: Since we have moved to forward SSS, we don't support anymore a bsdfData.normalWS.
    // Once we include normal+roughness rendering during the prepass, we will have a buffer to bind here and we will be able to reuse this part of the algorithm on demand.
#if SSS_USE_TANGENT_PLANE
    #error ThisWillNotCompile_SeeComment
    // Compute the tangent frame in view space.
    float3 normalVS = mul((float3x3)viewMatrix, bsdfData.normalWS);
    float3 tangentX = GetLocalFrame(normalVS)[0] * unitsPerMm;
    float3 tangentY = GetLocalFrame(normalVS)[1] * unitsPerMm;
#else
    // Placeholders: the image-plane path does not read them.
    float3 normalVS = float3(0, 0, 0);
    float3 tangentX = float3(0, 0, 0);
    float3 tangentY = float3(0, 0, 0);
#endif

#if SSS_DEBUG_NORMAL_VS
    // We expect the normal to be front-facing.
    float3 viewDirVS = normalize(centerPosVS);
    if (dot(normalVS, viewDirVS) >= 0)
    {
        StoreResult(pixelCoord, float3(1, 1, 1));
        return;
    }
#endif

    // Use more samples for SS regions larger than 5x5 pixels (rotated by 45 degrees).
    bool useNearFieldKernel = SSS_ENABLE_NEAR_FIELD && maxDistInPixels > SSS_LOD_THRESHOLD;

#if SSS_DEBUG_LOD
    StoreResult(pixelCoord, useNearFieldKernel ? float3(1, 0, 0) : float3(0.5, 0.5, 0));
    return;
#endif

    // Compute the indices used to access the individual components of the float4 of the kernel.
    uint iR = useNearFieldKernel ? 0 : 2; // radius
    uint iP = useNearFieldKernel ? 1 : 3; // rcp(pdf)

    // Sample 0 of the kernel is the center sample; no bilateral term is applied to it.
    float  centerRadius = _FilterKernels[profileID][0][iR];
    float  centerRcpPdf = _FilterKernels[profileID][0][iP];
    float3 centerWeight = DisneyProfilePolar(centerRadius, shapeParam) * centerRcpPdf;

    // Accumulate filtered irradiance and bilateral weights (for renormalization).
    float3 totalIrradiance = centerWeight * centerIrradiance;
    float3 totalWeight     = centerWeight;

    // Samples are taken around the center of the pixel.
    float2 centerCoord = pixelCoord + 0.5;

    int i, n; // Declare once to avoid the warning from the Unity shader compiler.

    // The first SSS_N_SAMPLES_FAR_FIELD samples are always evaluated.
    [unroll]
    for (i = 1, n = SSS_N_SAMPLES_FAR_FIELD; i < n; i++)
    {
        // Integrate over the image or tangent plane in the view space.
        EvaluateSample(i, n, profileID, iR, iP, centerCoord, cacheOffset,
                       shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                       tangentX, tangentY, projMatrix,
                       totalIrradiance, totalWeight);
    }

    // The remaining samples are only evaluated for the near-field kernel.
    [branch] if (useNearFieldKernel)
    {
        [unroll]
        for (i = SSS_N_SAMPLES_FAR_FIELD, n = SSS_N_SAMPLES_NEAR_FIELD; i < n; i++)
        {
            // Integrate over the image or tangent plane in the view space.
            EvaluateSample(i, n, profileID, iR, iP, centerCoord, cacheOffset,
                           shapeParam, centerPosVS, mmPerUnit, pixelsPerMm,
                           tangentX, tangentY, projMatrix,
                           totalIrradiance, totalWeight);
        }
    }

    // Renormalize by the accumulated weight and apply the albedo.
    StoreResult(pixelCoord, albedo * totalIrradiance / totalWeight);
}
|