// Identifiers for the supported shadow moment algorithms. Each kernel variant below
// selects one of them through SHADOW_MOMENT_ALGORITHM.
#define VSM 0
#define EVSM_2 1
#define EVSM_4 2
#define MSM 3
// Edge length of the square thread group (used for numthreads and the LDS tile layout).
#define THREADS 16
// Largest blur kernel width for which kernel variants are declared below (BLUR_SIZE=17).
#define MAX_BLUR_SIZE (THREADS+1)

// Kernel variants. Every "#pragma kernel" line declares one entry point and the macros
// that are set while compiling it:
//   KERNEL_MAIN             - name of the entry point function
//   SHADOW_MOMENT_ALGORITHM - one of VSM / EVSM_2 / EVSM_4 / MSM
//   MAX_MSAA                - 1 for non-multisampled depth input, 8 for the MSAA variants
//   BLUR_SIZE               - width of the separable blur kernel in texels (odd, 3..17)

// VSM, non-MSAA depth input
#pragma kernel main_VSM_3 KERNEL_MAIN=main_VSM_3 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_VSM_5 KERNEL_MAIN=main_VSM_5 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_VSM_7 KERNEL_MAIN=main_VSM_7 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_VSM_9 KERNEL_MAIN=main_VSM_9 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_VSM_11 KERNEL_MAIN=main_VSM_11 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_VSM_13 KERNEL_MAIN=main_VSM_13 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_VSM_15 KERNEL_MAIN=main_VSM_15 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_VSM_17 KERNEL_MAIN=main_VSM_17 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=17

// EVSM with 2 moments, non-MSAA depth input
#pragma kernel main_EVSM_2_3 KERNEL_MAIN=main_EVSM_2_3 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_EVSM_2_5 KERNEL_MAIN=main_EVSM_2_5 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_EVSM_2_7 KERNEL_MAIN=main_EVSM_2_7 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_EVSM_2_9 KERNEL_MAIN=main_EVSM_2_9 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_EVSM_2_11 KERNEL_MAIN=main_EVSM_2_11 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_EVSM_2_13 KERNEL_MAIN=main_EVSM_2_13 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_EVSM_2_15 KERNEL_MAIN=main_EVSM_2_15 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_EVSM_2_17 KERNEL_MAIN=main_EVSM_2_17 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=17

// EVSM with 4 moments, non-MSAA depth input
#pragma kernel main_EVSM_4_3 KERNEL_MAIN=main_EVSM_4_3 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_EVSM_4_5 KERNEL_MAIN=main_EVSM_4_5 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_EVSM_4_7 KERNEL_MAIN=main_EVSM_4_7 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_EVSM_4_9 KERNEL_MAIN=main_EVSM_4_9 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_EVSM_4_11 KERNEL_MAIN=main_EVSM_4_11 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_EVSM_4_13 KERNEL_MAIN=main_EVSM_4_13 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_EVSM_4_15 KERNEL_MAIN=main_EVSM_4_15 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_EVSM_4_17 KERNEL_MAIN=main_EVSM_4_17 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=17

// MSM, non-MSAA depth input
#pragma kernel main_MSM_3 KERNEL_MAIN=main_MSM_3 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_MSM_5 KERNEL_MAIN=main_MSM_5 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_MSM_7 KERNEL_MAIN=main_MSM_7 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_MSM_9 KERNEL_MAIN=main_MSM_9 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_MSM_11 KERNEL_MAIN=main_MSM_11 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_MSM_13 KERNEL_MAIN=main_MSM_13 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_MSM_15 KERNEL_MAIN=main_MSM_15 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_MSM_17 KERNEL_MAIN=main_MSM_17 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=17

// VSM, multisampled depth input
#pragma kernel main_MSAA_VSM_3 KERNEL_MAIN=main_MSAA_VSM_3 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=3
#pragma kernel main_MSAA_VSM_5 KERNEL_MAIN=main_MSAA_VSM_5 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=5
#pragma kernel main_MSAA_VSM_7 KERNEL_MAIN=main_MSAA_VSM_7 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=7
#pragma kernel main_MSAA_VSM_9 KERNEL_MAIN=main_MSAA_VSM_9 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=9
#pragma kernel main_MSAA_VSM_11 KERNEL_MAIN=main_MSAA_VSM_11 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=11
#pragma kernel main_MSAA_VSM_13 KERNEL_MAIN=main_MSAA_VSM_13 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=13
#pragma kernel main_MSAA_VSM_15 KERNEL_MAIN=main_MSAA_VSM_15 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=15
#pragma kernel main_MSAA_VSM_17 KERNEL_MAIN=main_MSAA_VSM_17 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=8 BLUR_SIZE=17

// EVSM with 2 moments, multisampled depth input
#pragma kernel main_MSAA_EVSM_2_3 KERNEL_MAIN=main_MSAA_EVSM_2_3 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=3
#pragma kernel main_MSAA_EVSM_2_5 KERNEL_MAIN=main_MSAA_EVSM_2_5 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=5
#pragma kernel main_MSAA_EVSM_2_7 KERNEL_MAIN=main_MSAA_EVSM_2_7 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=7
#pragma kernel main_MSAA_EVSM_2_9 KERNEL_MAIN=main_MSAA_EVSM_2_9 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=9
#pragma kernel main_MSAA_EVSM_2_11 KERNEL_MAIN=main_MSAA_EVSM_2_11 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=11
#pragma kernel main_MSAA_EVSM_2_13 KERNEL_MAIN=main_MSAA_EVSM_2_13 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=13
#pragma kernel main_MSAA_EVSM_2_15 KERNEL_MAIN=main_MSAA_EVSM_2_15 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=15
#pragma kernel main_MSAA_EVSM_2_17 KERNEL_MAIN=main_MSAA_EVSM_2_17 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=8 BLUR_SIZE=17

// EVSM with 4 moments, multisampled depth input (the list continues below)
#pragma kernel main_MSAA_EVSM_4_3 KERNEL_MAIN=main_MSAA_EVSM_4_3 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=3
#pragma kernel main_MSAA_EVSM_4_5 KERNEL_MAIN=main_MSAA_EVSM_4_5 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=5
#pragma kernel main_MSAA_EVSM_4_7 KERNEL_MAIN=main_MSAA_EVSM_4_7 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=7
#pragma kernel main_MSAA_EVSM_4_9 KERNEL_MAIN=main_MSAA_EVSM_4_9 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=9
#pragma kernel main_MSAA_EVSM_4_11 KERNEL_MAIN=main_MSAA_EVSM_4_11 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=11
#pragma kernel main_MSAA_EVSM_4_13 KERNEL_MAIN=main_MSAA_EVSM_4_13 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=13
#pragma kernel main_MSAA_EVSM_4_15 KERNEL_MAIN=main_MSAA_EVSM_4_15 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=15
#pragma kernel main_MSAA_EVSM_4_17 KERNEL_MAIN=main_MSAA_EVSM_4_17 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=8 BLUR_SIZE=17

#pragma kernel main_MSAA_MSM_3 KERNEL_MAIN=main_MSAA_MSM_3 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=3
#pragma kernel main_MSAA_MSM_5 KERNEL_MAIN=main_MSAA_MSM_5 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=5
#pragma kernel main_MSAA_MSM_7 KERNEL_MAIN=main_MSAA_MSM_7 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=7
#pragma kernel main_MSAA_MSM_9 KERNEL_MAIN=main_MSAA_MSM_9 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=9
#pragma kernel main_MSAA_MSM_11 KERNEL_MAIN=main_MSAA_MSM_11 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=11
#pragma kernel main_MSAA_MSM_13 KERNEL_MAIN=main_MSAA_MSM_13 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=13
#pragma kernel main_MSAA_MSM_15 KERNEL_MAIN=main_MSAA_MSM_15 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=15
#pragma kernel main_MSAA_MSM_17 KERNEL_MAIN=main_MSAA_MSM_17 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=8 BLUR_SIZE=17

#include "../../common.hlsl"
#include "../ShadowMoments.hlsl"

// Source depth buffer. Only the .x component of each load is consumed, so the element
// type is declared as a single float.
#if MAX_MSAA > 1
Texture2DMS<float> depthTex;
#else
Texture2D<float> depthTex;
#endif

uniform uint4 srcRect; // .xy = offset, .zw = width/height
uniform uint4 dstRect; // .xy = offset, .z = array slice , .w = Flags: 1 := 16bpp, 2 := 2 channels pp, 4:= reversed z
uniform float4 blurWeightsStorage[3]; // Unity expects float arrays to be tightly packed

// Flat view of the packed weights; indexed by the absolute tap offset from the kernel center.
static float blurWeights[12] = (float[12])blurWeightsStorage;

// Flag bits stored in dstRect.w
static const int kBits_16    = 1; // 16 bits per channel
static const int kChannels_2 = 2; // 2 channels per pixel
static const int kReversed_z = 4; // depth buffer contains reversed z

// DepthToMoments converts one depth value into the moment vector of the selected algorithm.
// SHADOW_MOMENTS is the number of moments (2 or 4) that the algorithm stores per texel.
#if (SHADOW_MOMENT_ALGORITHM == VSM)
#   define SHADOW_MOMENTS 2

float2 DepthToMoments( float depth )
{
    return float2( depth, depth * depth );
}
#elif SHADOW_MOMENT_ALGORITHM == EVSM_2
#   define SHADOW_MOMENTS 2

uniform float evsmExponent;

float2 DepthToMoments( float depth )
{
    // only the first warped component is used by the 2-moment variant
    float2 moments = ShadowMoments_WarpDepth( depth, evsmExponent.xx );
    return float2( moments.x, moments.x * moments.x );
}
#elif SHADOW_MOMENT_ALGORITHM == EVSM_4
#   define SHADOW_MOMENTS 4

uniform float2 evsmExponents;

float4 DepthToMoments( float depth )
{
    float2 moments = ShadowMoments_WarpDepth( depth, evsmExponents );
    return float4( moments.xy, moments.xy * moments.xy );
}
#elif SHADOW_MOMENT_ALGORITHM == MSM
#   define SHADOW_MOMENTS 4

float4 DepthToMoments( float depth )
{
    // 16 bit targets use the dedicated encoding from ShadowMoments.hlsl,
    // otherwise the plain powers depth^1 .. depth^4 are stored.
    if( (dstRect.w & kBits_16) != 0 )
        return ShadowMoments_Encode16MSM( depth );
    else
    {
        float dsq = depth * depth;
        return float4( depth, dsq, depth * dsq, dsq * dsq );
    }
}
#else
#   error "No valid shadow moment algorithm has been set to the define SHADOW_MOMENT_ALGORITHM."
#endif

// Number of extra texels needed on each side of a tile to evaluate the blur.
#define BLUR_BORDER (BLUR_SIZE / 2)
// Width (horizontal pass) / height (vertical pass) of the tile including both borders.
#define LDS_STRIDE  (THREADS + BLUR_BORDER + BLUR_BORDER)
// float2 or float4, depending on the number of moments.
#define moment_t    MERGE_NAME( float, SHADOW_MOMENTS )

// Destination moment texture array. The element type matches the value written by the
// kernel (moment_t); an untyped declaration cannot take a float2 store.
RWTexture2DArray<moment_t> outputTex;

groupshared float moments1[THREADS * LDS_STRIDE]; // contains the blurred first moment
groupshared float moments2[THREADS * LDS_STRIDE]; // contains the blurred second moment
groupshared float moments3[THREADS * LDS_STRIDE]; // contains the blurred third moment
groupshared float moments4[THREADS * LDS_STRIDE]; // contains the blurred fourth moment
groupshared float sampleWeights[MAX_MSAA];        // per MSAA sample weight
groupshared float sumWeights;                     // reciprocal of the sum of all sample weights

// Maps a 2D tile position to an index into the groupshared moment arrays.
// 'stride' is the number of elements per logical row.
int getLDSIdx( int2 pos, int stride )
{
    // interleave two consecutive rows to avoid bank conflicts
    return (pos.y >> 1) * (stride << 1) + (pos.x << 1) + (pos.y & 1);
}

// Stores one moment vector at tile position 'pos', one component per groupshared array.
void writeToShared( moment_t val, int2 pos, int stride )
{
    int idx = getLDSIdx( pos, stride );

    moments1[idx] = val.x;
    moments2[idx] = val.y;
#if SHADOW_MOMENTS == 4
    moments3[idx] = val.z;
    moments4[idx] = val.w;
#endif
}

// Loads the moment vector stored at tile position 'pos'.
moment_t readFromShared( int2 pos, int stride )
{
    int idx = getLDSIdx( pos, stride );

    moment_t res;
    res.x = moments1[idx];
    res.y = moments2[idx];
#if SHADOW_MOMENTS == 4
    res.z = moments3[idx];
    res.w = moments4[idx];
#endif
    return res;
}

// Converts depth to shadow moments and applies a separable blur of BLUR_SIZE taps.
//
// Each thread group of THREADS x THREADS threads produces one THREADS x THREADS output
// tile. Source texels are read from depthTex relative to srcRect.xy, the result is
// written to outputTex at dstRect.xy, array slice dstRect.z.
//   pass 1: load depth, convert to moments, blur horizontally (via groupshared memory)
//   pass 2: blur the horizontal results vertically and write the output texel
// Threads whose dispatch id lies outside srcRect.zw still take part in all loads and
// barriers but do not write any output.
[numthreads( THREADS, THREADS, 1 )]
void KERNEL_MAIN( uint3 dispatchId : SV_DispatchThreadID, uint3 groupThreadId : SV_GroupThreadID )
{
#if MAX_MSAA > 1
    uint width, height, sampleCnt;
    depthTex.GetDimensions( width, height, sampleCnt );
    sampleCnt = Clamp( sampleCnt, 2, MAX_MSAA );
    float sampleCntRcp = 1.0 / sampleCnt;

    // Every sample currently gets the same weight. One row of threads fills the table;
    // the other rows would only store identical values.
    if( groupThreadId.y == 0 && groupThreadId.x < sampleCnt )
    {
        sampleWeights[groupThreadId.x] = sampleCntRcp;
    }
    // The weights are written by several threads but summed by a single one, so they
    // have to be visible to the whole group before the sum is computed.
    GroupMemoryBarrierWithGroupSync();

    if( groupThreadId.x == 0 && groupThreadId.y == 0 )
    {
        float sum = 0.0;
        for( uint i = 0; i < sampleCnt; i++ )
            sum += sampleWeights[i];
        sumWeights = 1.0 / sum;
    }
    // sumWeights is read by every thread in the load loop below.
    GroupMemoryBarrierWithGroupSync();
#endif

    // load moments into LDS
    // each workgroup works on THREADS * THREADS tiles, but the blur filter requires
    // us to fetch enough data around the border of the current tile.
    // We assume that the blur filter's support does not exceed THREADS, so we fetch
    // the data in 4 blocks.
    const bool reverse_z  = (dstRect.w & kReversed_z) != 0;
    const int  blurBorder = BLUR_BORDER;
    // last valid texel of the source rectangle; fetches beyond it are clamped to it
    // NOTE(review): coordinates are only clamped against the upper bound. Texels left of /
    // above srcRect.xy are fetched unclamped - confirm that this is intended.
    const int2 validSrc   = (int2) (srcRect.xy + srcRect.zw - 1);

    int2 srcIdx = ((int2) dispatchId.xy) - blurBorder.xx + (int2) srcRect.xy;
    int2 ldsIdx = (int2) groupThreadId.xy;

    // horizontally blurred moments for the tile rows handled by this thread:
    // [0] = row groupThreadId.y, [1] = row groupThreadId.y + THREADS
    moment_t hblurredMoments[2];

    [unroll]
    for( int ih = 0; ih < 2; ih++ )
    {
        // fetch up to two blocks of THREADS columns to cover the LDS_STRIDE wide row
        [unroll]
        for( int iw = 0; iw < 2; iw++ )
        {
            if( ldsIdx.x < LDS_STRIDE )
            {
#if MAX_MSAA > 1
                // resolve the MSAA samples by averaging their moments
                moment_t avgMoments = 0.0;
                [loop]
                for( uint is = 0; is < sampleCnt; is++ )
                {
                    float depth = depthTex.Load( min( srcIdx, validSrc ), is ).x;
                    depth = reverse_z ? (1.0 - depth) : depth;
#   if SHADOW_MOMENT_ALGORITHM == MSM
                    // We're pancaking triangles to znear in the depth pass so depth and subsequently all moments can end up being zero.
                    // The solver ShadowMoments_SolveMSM then ends up calculating infinities and NaNs, which produces different results
                    // on different vendors' GPUs. So we're adding a small safety margin here.
                    depth = Clamp( depth, 0.001, 0.999 );
#   endif
                    avgMoments += sampleWeights[is] * DepthToMoments( depth );
                }
                avgMoments *= sumWeights;
                writeToShared( avgMoments, int2( ldsIdx.x, groupThreadId.y ), LDS_STRIDE );
#else
                float depth = depthTex.Load( int3( min( srcIdx, validSrc ), 0 ) ).x;
                depth = reverse_z ? (1.0 - depth) : depth;
#   if SHADOW_MOMENT_ALGORITHM == MSM
                // We're pancaking triangles to znear in the depth pass so depth and subsequently all moments can end up being zero.
                // The solver ShadowMoments_SolveMSM then ends up calculating infinities and NaNs, which produces different results
                // on different vendors' GPUs. So we're adding a small safety margin here.
                depth = Clamp( depth, 0.001, 0.999 );
#   endif
                writeToShared( DepthToMoments( depth ), int2( ldsIdx.x, groupThreadId.y ), LDS_STRIDE );
#endif
                ldsIdx.x += THREADS;
                srcIdx.x += THREADS;
            }
        }

        // all moments of this block of rows must be in LDS before they are blurred
        GroupMemoryBarrierWithGroupSync();

        hblurredMoments[ih] = 0;
        int2 idx = { groupThreadId.x + blurBorder, groupThreadId.y };
        [loop]
        for( int blurOffset = -blurBorder; blurOffset <= blurBorder; blurOffset++ )
        {
            hblurredMoments[ih] += readFromShared( int2( idx.x + blurOffset, idx.y ), LDS_STRIDE ) * blurWeights[abs( blurOffset )];
        }

        // all reads have to finish before the next iteration overwrites the LDS
        GroupMemoryBarrierWithGroupSync();

        // rewind to the first column and advance to the next block of THREADS rows
        ldsIdx.x  = groupThreadId.x;
        srcIdx.x  = (int) dispatchId.x - blurBorder + (int) srcRect.x;
        srcIdx.y += THREADS;
    }

    // update LDS with horizontally blurred values
    // (the tile is now THREADS wide and LDS_STRIDE high)
    writeToShared( hblurredMoments[0], (int2) groupThreadId.xy, THREADS );
    if( (groupThreadId.y + THREADS) < LDS_STRIDE )
        writeToShared( hblurredMoments[1], int2( groupThreadId.x, groupThreadId.y + THREADS ), THREADS );

    // sync threads
    GroupMemoryBarrierWithGroupSync();

    // second pass blurs vertically
    ldsIdx = (int2) groupThreadId.xy + int2( 0, blurBorder );
    moment_t vblurredMoment = 0.0;
    [unroll]
    for( int blurOffset = -blurBorder; blurOffset <= blurBorder; blurOffset++ )
    {
        vblurredMoment += readFromShared( int2( ldsIdx.x, ldsIdx.y + blurOffset ), THREADS ) * blurWeights[abs( blurOffset )];
    }

    // and write out the result
    if( all( dispatchId.xy < srcRect.zw ) )
    {
        dispatchId.xy += dstRect.xy;
        dispatchId.z   = dstRect.z;
        outputTex[dispatchId] = vblurredMoment;
    }
}