// Moment shadow map blur.
//
// Each kernel variant converts a depth texture into shadow moments (VSM, EVSM-2,
// EVSM-4 or MSM, selected by SHADOW_MOMENT_ALGORITHM) and applies a separable blur
// of BLUR_SIZE taps through group-shared memory before writing the result into one
// slice of a texture array.

#define VSM    0
#define EVSM_2 1
#define EVSM_4 2
#define MSM    3

#define THREADS       16
#define MAX_BLUR_SIZE 17

#pragma kernel main_VSM_3 KERNEL_MAIN=main_VSM_3 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_VSM_5 KERNEL_MAIN=main_VSM_5 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_VSM_7 KERNEL_MAIN=main_VSM_7 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_VSM_9 KERNEL_MAIN=main_VSM_9 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_VSM_11 KERNEL_MAIN=main_VSM_11 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_VSM_13 KERNEL_MAIN=main_VSM_13 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_VSM_15 KERNEL_MAIN=main_VSM_15 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_VSM_17 KERNEL_MAIN=main_VSM_17 SHADOW_MOMENT_ALGORITHM=VSM MAX_MSAA=1 BLUR_SIZE=17

#pragma kernel main_EVSM_2_3 KERNEL_MAIN=main_EVSM_2_3 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_EVSM_2_5 KERNEL_MAIN=main_EVSM_2_5 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_EVSM_2_7 KERNEL_MAIN=main_EVSM_2_7 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_EVSM_2_9 KERNEL_MAIN=main_EVSM_2_9 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_EVSM_2_11 KERNEL_MAIN=main_EVSM_2_11 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_EVSM_2_13 KERNEL_MAIN=main_EVSM_2_13 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_EVSM_2_15 KERNEL_MAIN=main_EVSM_2_15 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_EVSM_2_17 KERNEL_MAIN=main_EVSM_2_17 SHADOW_MOMENT_ALGORITHM=EVSM_2 MAX_MSAA=1 BLUR_SIZE=17

#pragma kernel main_EVSM_4_3 KERNEL_MAIN=main_EVSM_4_3 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_EVSM_4_5 KERNEL_MAIN=main_EVSM_4_5 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_EVSM_4_7 KERNEL_MAIN=main_EVSM_4_7 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_EVSM_4_9 KERNEL_MAIN=main_EVSM_4_9 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_EVSM_4_11 KERNEL_MAIN=main_EVSM_4_11 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_EVSM_4_13 KERNEL_MAIN=main_EVSM_4_13 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_EVSM_4_15 KERNEL_MAIN=main_EVSM_4_15 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_EVSM_4_17 KERNEL_MAIN=main_EVSM_4_17 SHADOW_MOMENT_ALGORITHM=EVSM_4 MAX_MSAA=1 BLUR_SIZE=17

#pragma kernel main_MSM_3 KERNEL_MAIN=main_MSM_3 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=3
#pragma kernel main_MSM_5 KERNEL_MAIN=main_MSM_5 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=5
#pragma kernel main_MSM_7 KERNEL_MAIN=main_MSM_7 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=7
#pragma kernel main_MSM_9 KERNEL_MAIN=main_MSM_9 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=9
#pragma kernel main_MSM_11 KERNEL_MAIN=main_MSM_11 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=11
#pragma kernel main_MSM_13 KERNEL_MAIN=main_MSM_13 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=13
#pragma kernel main_MSM_15 KERNEL_MAIN=main_MSM_15 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=15
#pragma kernel main_MSM_17 KERNEL_MAIN=main_MSM_17 SHADOW_MOMENT_ALGORITHM=MSM MAX_MSAA=1 BLUR_SIZE=17

#include "../../common.hlsl"
#include "../ShadowMoments.hlsl"

// Number of extra texels needed on each side of a tile, and the resulting edge
// length of the group-shared tile.
#define BLUR_BORDER (BLUR_SIZE / 2)
#define LDS_SIZE    (THREADS + BLUR_BORDER + BLUR_BORDER)

// The LDS fill below fetches at most 2 x 2 blocks of THREADS x THREADS texels, so
// the tile (THREADS + 2 * BLUR_BORDER) must not exceed 2 * THREADS per axis.
#if BLUR_SIZE > MAX_BLUR_SIZE
#   error "BLUR_SIZE exceeds MAX_BLUR_SIZE."
#endif

#if MAX_MSAA == 1
Texture2D<float>   depthTex;
#else
Texture2DMS<float> depthTex;
#endif

uniform uint4  srcRect;                 // .xy = offset, .zw = width/height
uniform uint4  dstRect;                 // .xy = offset, .z = array slice, .w = Flags: 1 := 16bpp, 2 := 2 channels pp, 4 := reversed z
uniform float4 blurWeightsStorage[3];   // Unity expects float arrays to be tightly packed

// Scalar view of the packed weights; indexed by the absolute tap offset.
static float blurWeights[12] = (float[12]) blurWeightsStorage;

static const int kBits_16    = 1;   // 16 bits per channel
static const int kChannels_2 = 2;   // 2 channels per pixel
static const int kReversed_z = 4;   // depth buffer contains reversed z

// DepthToMoments: converts a single depth value into the moment vector of the
// selected algorithm. SHADOW_MOMENTS is the number of components of that vector.
#if (SHADOW_MOMENT_ALGORITHM == VSM)
#   define SHADOW_MOMENTS 2

float2 DepthToMoments( float depth )
{
    return float2( depth, depth * depth );
}

#elif SHADOW_MOMENT_ALGORITHM == EVSM_2
#   define SHADOW_MOMENTS 2

uniform float evsmExponent;

float2 DepthToMoments( float depth )
{
    // Only the first warped component is kept for the 2-moment variant.
    float2 warped = ShadowMoments_WarpDepth( depth, evsmExponent.xx );
    return float2( warped.x, warped.x * warped.x );
}

#elif SHADOW_MOMENT_ALGORITHM == EVSM_4
#   define SHADOW_MOMENTS 4

uniform float2 evsmExponents;

float4 DepthToMoments( float depth )
{
    float2 warped = ShadowMoments_WarpDepth( depth, evsmExponents );
    return float4( warped.xy, warped.xy * warped.xy );
}

#elif SHADOW_MOMENT_ALGORITHM == MSM
#   define SHADOW_MOMENTS 4

float4 DepthToMoments( float depth )
{
    // The 16-bit flag in dstRect.w selects the encoded representation.
    [branch]
    if( (dstRect.w & kBits_16) != 0 )
    {
        return ShadowMoments_Encode16MSM( depth );
    }
    else
    {
        float dsq = depth * depth;
        return float4( depth, dsq, depth * dsq, dsq * dsq );
    }
}

#else
#   error "No valid shadow moment algorithm has been set to the define SHADOW_MOMENT_ALGORITHM."
#endif

#define moment_t MERGE_NAME( float, SHADOW_MOMENTS )

// RW resources need an explicit element type.
RWTexture2DArray<moment_t> outputTex;

// Tile of moments shared by the thread group: the THREADS x THREADS core plus a
// BLUR_BORDER wide apron on every side.
groupshared moment_t moments[LDS_SIZE][LDS_SIZE];

// Kernel entry point (the actual name is injected per variant through KERNEL_MAIN).
//
// Runs with THREADS x THREADS threads per group; every thread produces one output
// texel. Steps:
//   1. fill the group-shared tile with moments computed from depthTex, clamping
//      source coordinates to srcRect,
//   2. blur the tile horizontally in place,
//   3. blur vertically and write the result to outputTex at dstRect.xy + thread
//      position, slice dstRect.z.
// Threads whose position lies outside srcRect.zw still take part in the LDS work
// and barriers but do not write any output.
[numthreads( THREADS, THREADS, 1 )]
void KERNEL_MAIN( uint3 dispatchId : SV_DispatchThreadID, uint3 groupThreadId : SV_GroupThreadID )
{
    // Loop counters are declared up front because the compiler scopes like it's 1999.
    uint iw, ih;

#if MAX_MSAA > 1
    uint i, is;
    uint width, height, sampleCnt;
    depthTex.GetDimensions( width, height, sampleCnt );
    // Keep the count inside [1, MAX_MSAA]: it indexes sampleWeights and is used as a divisor.
    sampleCnt = clamp( sampleCnt, 1u, (uint) MAX_MSAA );
    float sampleCntRcp = 1.0 / sampleCnt;

    // Per-sample weights. Currently uniform.
    // TODO: find a better weight filter (e.g. based on depthTex.GetSamplePosition).
    float sumWeights = 0;
    float sampleWeights[MAX_MSAA];
    for( i = 0; i < sampleCnt; i++ )
    {
        sampleWeights[i] = sampleCntRcp;
        sumWeights += sampleWeights[i];
    }
    sumWeights = 1.0 / sumWeights;
#endif

    // Load moments into LDS.
    // Each workgroup works on a THREADS * THREADS tile, but the blur filter requires
    // us to fetch enough data around the border of the current tile.
    // We assume that the blur filter's support does not exceed THREADS, so we fetch
    // the data in 4 blocks.
    const int  blurBorder = BLUR_BORDER;
    const int2 ldsSize    = int2( LDS_SIZE, LDS_SIZE );
    const int2 threadsCnt = int2( THREADS, THREADS );
    const int4 validSrc   = int4( srcRect.xy, srcRect.xy + srcRect.zw - 1 );

    int2 srcIdx = ((int2) dispatchId.xy) - blurBorder.xx + srcRect.xy;
    int2 ldsIdx = groupThreadId.xy;

    const bool reverse_z = (dstRect.w & kReversed_z) != 0;

    // Calculate an average moment over all samples for a given pixel and load the
    // result into LDS.
    [unroll]
    for( ih = 0; ih < 2; ih++ )
    {
        // Indices only grow, so once a row is past the tile the rest can be skipped.
        [branch]
        if( ldsIdx.y >= ldsSize.y )
            continue;

        [unroll]
        for( iw = 0; iw < 2; iw++ )
        {
            [branch]
            if( ldsIdx.x >= ldsSize.x )
                continue;

            // Texels outside the source rectangle repeat the nearest edge texel.
            const int2 srcCoord = Clamp( srcIdx, validSrc.xy, validSrc.zw );

            moment_t avgMoments = 0.0;
#if MAX_MSAA > 1
            for( is = 0; is < sampleCnt; is++ )
            {
                // Texture2DMS::Load takes the texel coordinate and the sample index separately.
                float depth = depthTex.Load( srcCoord, is ).x;
                depth = reverse_z ? (1.0 - depth) : depth;
                moment_t sampleMoments = DepthToMoments( depth );
                avgMoments += sampleWeights[is] * sampleMoments;
            }
            avgMoments *= sumWeights;
#else
            float depth = depthTex.Load( int3( srcCoord, 0 ) ).x;
            avgMoments = DepthToMoments( reverse_z ? (1.0 - depth) : depth );
#endif
            moments[ldsIdx.y][ldsIdx.x] = avgMoments;

            ldsIdx.x += threadsCnt.x;
            srcIdx.x += threadsCnt.x;
        }

        // Rewind x for the next row block and step y down by one block.
        ldsIdx.x  = groupThreadId.x;
        srcIdx.x  = (int) dispatchId.x - blurBorder + srcRect.x;
        ldsIdx.y += threadsCnt.y;
        srcIdx.y += threadsCnt.y;
    }

    // Sync across all threads so LDS contains the moments for each pixel that we
    // need for the blur.
    GroupMemoryBarrierWithGroupSync();

    // First pass blurs horizontally. Each thread handles its own row and, if it is
    // still inside the tile, the row THREADS below it, so that all rows the
    // vertical pass reads are covered.
    ldsIdx = groupThreadId.xy + int2( blurBorder, 0 );
    moment_t hblurredMoment = 0.0, hblurredMoment2 = 0.0;
    int blurOffset;
    for( blurOffset = -blurBorder; blurOffset <= blurBorder; blurOffset++ )
    {
        hblurredMoment += moments[ldsIdx.y][ldsIdx.x + blurOffset] * blurWeights[abs( blurOffset )];
    }

    ldsIdx.y += threadsCnt.y;
    [branch]
    if( ldsIdx.y < ldsSize.y )
    {
        for( blurOffset = -blurBorder; blurOffset <= blurBorder; blurOffset++ )
        {
            hblurredMoment2 += moments[ldsIdx.y][ldsIdx.x + blurOffset] * blurWeights[abs( blurOffset )];
        }
    }

    // Make sure all reads are done before the tile is overwritten.
    GroupMemoryBarrierWithGroupSync();

    // Replace LDS values with the horizontally blurred values.
    moments[groupThreadId.y][ldsIdx.x] = hblurredMoment;
    [branch]
    if( ldsIdx.y < ldsSize.y )
        moments[ldsIdx.y][ldsIdx.x] = hblurredMoment2;

    GroupMemoryBarrierWithGroupSync();

    // Second pass blurs vertically.
    ldsIdx = groupThreadId.xy + blurBorder.xx;
    moment_t vblurredMoment = 0.0;
    for( blurOffset = -blurBorder; blurOffset <= blurBorder; blurOffset++ )
    {
        vblurredMoment += moments[ldsIdx.y + blurOffset][ldsIdx.x] * blurWeights[abs( blurOffset )];
    }

    // And write out the result, skipping threads outside the source rectangle.
    if( all( dispatchId.xy < srcRect.zw ) )
    {
        outputTex[uint3( dispatchId.xy + dstRect.xy, dstRect.z )] = vblurredMoment;
    }
}