浏览代码

Dispatch the SSS CS in 4x swizzled 16x16 groups per a 32x32 macrotile

/asmdef
Evgenii Golubev 7 年前
当前提交
811c0769
共有 3 个文件被更改,包括 58 次插入41 次删除
  1. 66
      ScriptableRenderPipeline/Core/ShaderLibrary/SpaceFillingCurves.hlsl
  2. 6
      ScriptableRenderPipeline/HDRenderPipeline/HDRenderPipeline.cs
  3. 27
      ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute

66
ScriptableRenderPipeline/Core/ShaderLibrary/SpaceFillingCurves.hlsl


// Ref: https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
uint Part1By1(uint x)
{
x &= 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
return x;
x &= 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
return x;
}
// "Insert" two 0 bits after each of the 10 low bits of x/

x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
return x;
x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
return x;
}
// Inverse of Part1By1 - "delete" all odd-indexed bits/

x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
return x;
x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
return x;
}
// Inverse of Part1By2 - "delete" all bits not at positions divisible by 3/

x &= 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
x = (x ^ (x >> 2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x >> 4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x >> 8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
return x;
x &= 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
x = (x ^ (x >> 2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x >> 4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x >> 8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
return x;
return (Part1By1(coord.y) << 1) + Part1By1(coord.x);
return (Part1By1(coord.y) << 1) + Part1By1(coord.x);
return (Part1By2(coord.z) << 2) + (Part1By2(coord.y) << 1) + Part1By2(coord.x);
return (Part1By2(coord.z) << 2) + (Part1By2(coord.y) << 1) + Part1By2(coord.x);
return uint2(Compact1By1(code >> 0), Compact1By1(code >> 1));
return uint2(Compact1By1(code >> 0), Compact1By1(code >> 1));
return uint3(Compact1By2(code >> 0), Compact1By2(code >> 1), Compact1By2(code >> 2));
return uint3(Compact1By2(code >> 0), Compact1By2(code >> 1), Compact1By2(code >> 2));
}
uint InterleaveQuad(uint2 quad)
{
return quad.x + 2 * quad.y;
}
uint2 DeinterleaveQuad(uint code)
{
return uint2(code & 1, (code >> 1) & 1);
}
#endif // UNITY_SPACE_FILLING_CURVES_INCLUDED

6
ScriptableRenderPipeline/HDRenderPipeline/HDRenderPipeline.cs


cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, HDShaderIDs._CameraFilteringBuffer, m_CameraFilteringBufferRT);
// Perform the SSS filtering pass which fills 'm_CameraFilteringBufferRT'.
cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, ((int)hdCamera.screenSize.x + 15) / 16, ((int)hdCamera.screenSize.y + 15) / 16, 1);
// We dispatch 4x swizzled 16x16 groups per a 32x32 macrotile.
cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, 4, ((int)hdCamera.screenSize.x + 31) / 32, ((int)hdCamera.screenSize.y + 31) / 32);
cmd.SetGlobalTexture(HDShaderIDs._IrradianceSource, m_CameraFilteringBufferRT); // Cannot set a RT on a material

cmd.SetComputeTextureParam(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, HDShaderIDs._CameraColorTexture, m_CameraColorBufferRT);
// Perform the SSS filtering pass which performs an in-place update of 'm_CameraColorBufferRT'.
cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, ((int)hdCamera.screenSize.x + 15) / 16, ((int)hdCamera.screenSize.y + 15) / 16, 1);
// We dispatch 4x swizzled 16x16 groups per a 32x32 macrotile.
cmd.DispatchCompute(m_SubsurfaceScatteringCS, m_SubsurfaceScatteringKernel, 4, ((int)hdCamera.screenSize.x + 31) / 32, ((int)hdCamera.screenSize.y + 31) / 32);
}
}
else

27
ScriptableRenderPipeline/HDRenderPipeline/Material/Lit/Resources/SubsurfaceScattering.compute


#pragma kernel SubsurfaceScattering
[numthreads(GROUP_SIZE_2D, 1, 1)]
void SubsurfaceScattering(uint2 groupId : SV_GroupID,
void SubsurfaceScattering(uint3 groupId : SV_GroupID,
// We dispatch 4x swizzled 16x16 groups per a 32x32 macrotile.
// Therefore, we need to unswizzle. TODO: macrotile order.
uint2 groupOffset = DeinterleaveQuad(groupId.x);
uint2 groupCoord = uint2(groupId.y * 2 + groupOffset.x, groupId.z * 2 + groupOffset.y);
// Note: any factor of 64 is a suitable wave size for our algorithm.
uint waveIndex = WaveReadFirstLane(groupThreadId / 64);
uint laneIndex = groupThreadId % 64;

uint mortonCode = groupThreadId;
uint2 localCoord = DecodeMorton2D(mortonCode);
uint2 tileAnchor = groupId * GROUP_SIZE_1D;
uint2 pixelCoord = tileAnchor + localCoord;
int2 cacheAnchor = (int2)tileAnchor - TEXTURE_CACHE_BORDER;
uint2 groupAnchor = groupCoord * GROUP_SIZE_1D;
uint2 pixelCoord = groupAnchor + localCoord;
int2 cacheAnchor = (int2)groupAnchor - TEXTURE_CACHE_BORDER;
uint2 cacheCoord = localCoord + TEXTURE_CACHE_BORDER;
float stencilRef = STENCILLIGHTINGUSAGE_SPLIT_LIGHTING;

float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 0)).r;
float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 0)).r;
float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(0, 1)).r;
float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupId + uint2(1, 1)).r;
float s00 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(0, 0)).r;
float s10 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(1, 0)).r;
float s01 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(0, 1)).r;
float s11 = LOAD_TEXTURE2D(_HTile, 2 * groupCoord + uint2(1, 1)).r;
// Perform the stencil test (reject at the tile rate).
processGroup = (stencilRef == s00 || stencilRef == s10 || stencilRef == s01 || stencilRef == s11);

[branch] if (quadIndex < numBorderQuadsPerWave)
{
// Fetch another texel into the LDS.
uint2 startQuad = halfCacheWidthInQuads * uint2(waveIndex & 1, waveIndex >> 1);
uint2 startQuad = halfCacheWidthInQuads * DeinterleaveQuad(waveIndex);
uint2 quadCoord;

break;
}
uint2 cacheCoord2 = 2 * (startQuad + quadCoord) + uint2(laneIndex & 1, (laneIndex >> 1) & 1);
int2 pixelCoord2 = (int2)(tileAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
uint2 cacheCoord2 = 2 * (startQuad + quadCoord) + DeinterleaveQuad(laneIndex);
int2 pixelCoord2 = (int2)(groupAnchor + cacheCoord2) - TEXTURE_CACHE_BORDER;
float3 irradiance2 = LOAD_TEXTURE2D(_IrradianceSource, pixelCoord2).rgb;
float viewZ2 = 0;

正在加载...
取消
保存