
HDRenderLoop: Intermediate state

sebastienlagarde 8 年前
共有 13 个文件被更改,包括 1968 次插入和 23 次删除
  1. 46
  2. 6
  3. 166
  4. 9
  5. 267
  6. 9
  7. 555
  8. 9
  9. 440
  10. 9
  11. 466
  12. 9


colorMRTs[index] = RTIDs[index];
public void BindBuffers(Material mat)

public int gbufferCount { get; set; }

GBufferManager m_gbufferManager = new GBufferManager();
private int s_CameraColorBuffer;
private int s_CameraDepthBuffer;
private int s_VelocityBuffer;
private int s_DistortionBuffer;
int s_CameraColorBuffer;
int s_CameraDepthBuffer;
int s_VelocityBuffer;
int s_DistortionBuffer;
private ComputeBuffer s_punctualLightList;
private ComputeBuffer s_envLightList;
private ComputeBuffer s_areaLightList;
private ComputeBuffer s_punctualShadowList;
ComputeBuffer s_punctualLightList;
ComputeBuffer s_envLightList;
ComputeBuffer s_areaLightList;
ComputeBuffer s_punctualShadowList;
private TextureCacheCubemap m_cubeReflTexArray;
TextureCacheCubemap m_cubeReflTexArray;
TextureCache2D m_CookieTexArray;
TextureCacheCubemap m_CubeCookieTexArray;
private static int s_WidthOnRecord;
private static int s_HeightOnRecord;
static int s_WidthOnRecord;
static int s_HeightOnRecord;
void OnEnable()

m_cubeReflTexArray = new TextureCacheCubemap();
m_cubeReflTexArray.AllocTextureArray(32, (int)m_TextureSettings.reflectionCubemapSize, TextureFormat.BC6H, true);
m_CookieTexArray = new TextureCache2D();
m_CookieTexArray.AllocTextureArray(8, (int)m_TextureSettings.spotCookieSize, (int)m_TextureSettings.spotCookieSize, TextureFormat.RGBA32, true);
m_CubeCookieTexArray = new TextureCacheCubemap();
m_CubeCookieTexArray.AllocTextureArray(4, (int)m_TextureSettings.pointCookieSize, TextureFormat.RGBA32, true);
// Init Gbuffer description
m_LitRenderLoop = new Lit.RenderLoop(); // Our object can be garbage collected, so it needs to be allocated here

void OnDisable()

if (m_FinalPassMaterial) DestroyImmediate(m_FinalPassMaterial);
void InitAndClearBuffer(Camera camera, RenderLoop renderLoop)

UpdatePunctualLights(cullResults.visibleLights, ref shadows);
if (true)
var numLights = GenerateSourceLightBuffers(camera, cullResults);
BuildPerTileLightLists(camera, loop, numLights, projscr, invProjscr);
var numLights = 0; // GenerateSourceLightBuffers(camera, cullResults);
m_tilePassLightLoop.BuildPerTileLightLists(camera, loop, numLights, projscr, invProjscr);
PushGlobalParams(camera, loop, CameraToWorld(camera), projscr, invProjscr, numDirLights);
m_tilePassLightLoop.PushGlobalParams(camera, loop, CameraToWorld(camera), projscr, invProjscr, numDirLights);
DoTiledDeferredLighting(camera, loop, numLights, numDirLights);
m_tilePassLightLoop.DoTiledDeferredLighting(camera, loop, numLights, numDirLights);
RenderDeferredLighting(camera, renderLoop);


Shader "Hidden/HDRenderLoop/Deferred"
_SrcBlend("", Float) = 1
_DstBlend("", Float) = 1


public const int MaxNumLights = 1024;
public const int MaxNumDirLights = 2;
public const float FltMax = 3.402823466e+38F;
ComputeShader buildScreenAABBShader;
ComputeShader buildPerTileLightListShader; // FPTL
ComputeShader buildPerBigTileLightListShader;
ComputeShader buildPerVoxelLightListShader; // clustered
private static int s_GenAABBKernel;
private static int s_GenListPerTileKernel;
private static int s_GenListPerVoxelKernel;

public bool enableDrawLightBoundsDebug = false;
public bool enableDrawTileDebug = false;
public bool enableComputeLightEvaluation = false;
const bool k_UseDepthBuffer = true;// // only has an impact when EnableClustered is true (requires a depth-prepass)
const bool k_UseDepthBuffer = true; // only has an impact when EnableClustered is true (requires a depth-prepass)
const bool k_UseAsyncCompute = true; // should not use on mobile
const int k_Log2NumClusters = 6; // accepted range is from 0 to 6. NumClusters is 1<<g_iLog2NumClusters

// clustered light list specific buffers and data end
const int k_TileSize = 16;
bool usingFptl
bool isEnabledMSAA = false;
Debug.Assert(!isEnabledMSAA || enableClustered);
bool disableFptl = (disableFptlWhenClustered && enableClustered) || isEnabledMSAA;
return !disableFptl;
// Local function
void ClearComputeBuffers()
if (s_AABBBoundsBuffer != null)
if (s_ConvexBoundsBuffer != null)
if (s_LightDataBuffer != null)
if (s_DirLightList != null)
if (enableClustered)
if (s_GlobalLightListAtomic != null)
void Rebuild()
buildScreenAABBShader = Resources.Load<ComputeShader>("Resources/srcbound");
buildPerTileLightListShader = Resources.Load<ComputeShader>("Resources/lightlistbuild");
buildPerBigTileLightListShader = Resources.Load<ComputeShader>("Resources/lightlistbuild-bigtile");
buildPerVoxelLightListShader = Resources.Load<ComputeShader>("Resources/lightlistbuild-clustered");
s_GenAABBKernel = buildScreenAABBShader.FindKernel("ScreenBoundsAABB");
s_GenListPerTileKernel = buildPerTileLightListShader.FindKernel(enableBigTilePrepass ? "TileLightListGen_SrcBigTile" : "TileLightListGen");
s_AABBBoundsBuffer = new ComputeBuffer(2 * MaxNumLights, 3 * sizeof(float));
s_ConvexBoundsBuffer = new ComputeBuffer(MaxNumLights, System.Runtime.InteropServices.Marshal.SizeOf(typeof(SFiniteLightBound)));
s_LightDataBuffer = new ComputeBuffer(MaxNumLights, System.Runtime.InteropServices.Marshal.SizeOf(typeof(SFiniteLightData)));
s_DirLightList = new ComputeBuffer(MaxNumDirLights, System.Runtime.InteropServices.Marshal.SizeOf(typeof(DirectionalLight)));
buildScreenAABBShader.SetBuffer(s_GenAABBKernel, "g_data", s_ConvexBoundsBuffer);
//m_BuildScreenAABBShader.SetBuffer(kGenAABBKernel, "g_vBoundsBuffer", m_aabbBoundsBuffer);
buildPerTileLightListShader.SetBuffer(s_GenListPerTileKernel, "g_vBoundsBuffer", s_AABBBoundsBuffer);
buildPerTileLightListShader.SetBuffer(s_GenListPerTileKernel, "g_vLightData", s_LightDataBuffer);
buildPerTileLightListShader.SetBuffer(s_GenListPerTileKernel, "g_data", s_ConvexBoundsBuffer);
if (enableClustered)
var kernelName = enableBigTilePrepass ? (k_UseDepthBuffer ? "TileLightListGen_DepthRT_SrcBigTile" : "TileLightListGen_NoDepthRT_SrcBigTile") : (k_UseDepthBuffer ? "TileLightListGen_DepthRT" : "TileLightListGen_NoDepthRT");
s_GenListPerVoxelKernel = buildPerVoxelLightListShader.FindKernel(kernelName);
s_ClearVoxelAtomicKernel = buildPerVoxelLightListShader.FindKernel("ClearAtomic");
buildPerVoxelLightListShader.SetBuffer(s_GenListPerVoxelKernel, "g_vBoundsBuffer", s_AABBBoundsBuffer);
buildPerVoxelLightListShader.SetBuffer(s_GenListPerVoxelKernel, "g_vLightData", s_LightDataBuffer);
buildPerVoxelLightListShader.SetBuffer(s_GenListPerVoxelKernel, "g_data", s_ConvexBoundsBuffer);
s_GlobalLightListAtomic = new ComputeBuffer(1, sizeof(uint));
if (enableBigTilePrepass)
s_GenListPerBigTileKernel = buildPerBigTileLightListShader.FindKernel("BigTileLightListGen");
buildPerBigTileLightListShader.SetBuffer(s_GenListPerBigTileKernel, "g_vBoundsBuffer", s_AABBBoundsBuffer);
buildPerBigTileLightListShader.SetBuffer(s_GenListPerBigTileKernel, "g_vLightData", s_LightDataBuffer);
buildPerBigTileLightListShader.SetBuffer(s_GenListPerBigTileKernel, "g_data", s_ConvexBoundsBuffer);
void OnDisable()
if (enableClustered)
public bool NeedResize()

void BuildPerTileLightLists(Camera camera, RenderLoop loop, int numLights, Matrix4x4 projscr, Matrix4x4 invProjscr)
var w = camera.pixelWidth;
var h = camera.pixelHeight;
var numTilesX = (w + 15) / 16;
var numTilesY = (h + 15) / 16;
var numBigTilesX = (w + 63) / 64;
var numBigTilesY = (h + 63) / 64;
var cmd = new CommandBuffer() { name = "Build light list" };
// generate screen-space AABBs (used for both fptl and clustered).
var proj = CameraProjection(camera);
var temp = new Matrix4x4();
temp.SetRow(0, new Vector4(1.0f, 0.0f, 0.0f, 0.0f));
temp.SetRow(1, new Vector4(0.0f, 1.0f, 0.0f, 0.0f));
temp.SetRow(2, new Vector4(0.0f, 0.0f, 0.5f, 0.5f));
temp.SetRow(3, new Vector4(0.0f, 0.0f, 0.0f, 1.0f));
var projh = temp * proj;
var invProjh = projh.inverse;
cmd.SetComputeIntParam(buildScreenAABBShader, "g_iNrVisibLights", numLights);
SetMatrixCS(cmd, buildScreenAABBShader, "g_mProjection", projh);
SetMatrixCS(cmd, buildScreenAABBShader, "g_mInvProjection", invProjh);
cmd.SetComputeBufferParam(buildScreenAABBShader, s_GenAABBKernel, "g_vBoundsBuffer", s_AABBBoundsBuffer);
cmd.DispatchCompute(buildScreenAABBShader, s_GenAABBKernel, (numLights + 7) / 8, 1, 1);
// enable coarse 2D pass on 64x64 tiles (used for both fptl and clustered).
if (enableBigTilePrepass)
cmd.SetComputeIntParams(buildPerBigTileLightListShader, "g_viDimensions", new int[2] { w, h });
cmd.SetComputeIntParam(buildPerBigTileLightListShader, "g_iNrVisibLights", numLights);
SetMatrixCS(cmd, buildPerBigTileLightListShader, "g_mScrProjection", projscr);
SetMatrixCS(cmd, buildPerBigTileLightListShader, "g_mInvScrProjection", invProjscr);
cmd.SetComputeFloatParam(buildPerBigTileLightListShader, "g_fNearPlane", camera.nearClipPlane);
cmd.SetComputeFloatParam(buildPerBigTileLightListShader, "g_fFarPlane", camera.farClipPlane);
cmd.SetComputeBufferParam(buildPerBigTileLightListShader, s_GenListPerBigTileKernel, "g_vLightList", s_BigTileLightList);
cmd.DispatchCompute(buildPerBigTileLightListShader, s_GenListPerBigTileKernel, numBigTilesX, numBigTilesY, 1);
if (usingFptl) // optimized for opaques only
cmd.SetComputeIntParams(buildPerTileLightListShader, "g_viDimensions", new int[2] { w, h });
cmd.SetComputeIntParam(buildPerTileLightListShader, "g_iNrVisibLights", numLights);
SetMatrixCS(cmd, buildPerTileLightListShader, "g_mScrProjection", projscr);
SetMatrixCS(cmd, buildPerTileLightListShader, "g_mInvScrProjection", invProjscr);
cmd.SetComputeTextureParam(buildPerTileLightListShader, s_GenListPerTileKernel, "g_depth_tex", new RenderTargetIdentifier(s_CameraDepthTexture));
cmd.SetComputeBufferParam(buildPerTileLightListShader, s_GenListPerTileKernel, "g_vLightList", s_LightList);
if (enableBigTilePrepass) cmd.SetComputeBufferParam(buildPerTileLightListShader, s_GenListPerTileKernel, "g_vBigTileLightList", s_BigTileLightList);
cmd.DispatchCompute(buildPerTileLightListShader, s_GenListPerTileKernel, numTilesX, numTilesY, 1);
if (enableClustered) // works for transparencies too.
VoxelLightListGeneration(cmd, camera, numLights, projscr, invProjscr);


fileFormatVersion: 2
guid: 10637537837597a41861afbe118b246a
folderAsset: yes
timeCreated: 1479306736
licenseType: Pro


#pragma kernel BigTileLightListGen
#include "..\common\ShaderBase.h"
#include "LightDefinitions.cs.hlsl"
#include "LightingConvexHullUtils.hlsl"
#include "SortingComputeUtils.hlsl"
uniform int g_iNrVisibLights;
uniform uint2 g_viDimensions;
uniform float4x4 g_mInvScrProjection;
uniform float4x4 g_mScrProjection;
uniform float g_fNearPlane;
uniform float g_fFarPlane;
StructuredBuffer<float3> g_vBoundsBuffer : register( t1 );
StructuredBuffer<SFiniteLightData> g_vLightData : register( t2 );
StructuredBuffer<SFiniteLightBound> g_data : register( t3 );
#define NR_THREADS 64
// output buffer
RWBuffer<uint> g_vLightList : register( u0 );
// 2kB (room for roughly 30 wavefronts)
groupshared unsigned int lightsListLDS[MAX_NR_BIGTILE_LIGHTS_PLUSONE];
groupshared uint lightOffs;
float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far
float3 vP = float3(0.0f,0.0f,zDptBufSpace);
float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
return v4Pres.z / v4Pres.w;
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
float fSx = g_mScrProjection[0].x;
float fCx = g_mScrProjection[0].z;
float fSy = g_mScrProjection[1].y;
float fCy = g_mScrProjection[1].z;
return fLinDepth*float3( ((v2ScrPos.x-fCx)/fSx), ((v2ScrPos.y-fCy)/fSy), 1.0 );
return fLinDepth*float3( -((v2ScrPos.x+fCx)/fSx), -((v2ScrPos.y+fCy)/fSy), 1.0 );
float GetOnePixDiagWorldDistAtDepthOne()
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
return length( float2(1.0/fSx,1.0/fSy) );
void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
void CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR);
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint nrBigTilesX = (iWidth+63)/64;
uint nrBigTilesY = (iHeight+63)/64;
if(t==0) lightOffs = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
uint2 viTilLL = 64*tileIDX;
uint2 viTilUR = min( viTilLL+uint2(64,64), uint2(iWidth, iHeight) ); // not width and height minus 1 since viTilUR represents the end of the tile corner.
float2 vTileLL = float2(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight);
float2 vTileUR = float2(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight);
// build coarse list using AABB
for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
const float2 vMi = g_vBoundsBuffer[l].xy;
const float2 vMa = g_vBoundsBuffer[l+g_iNrVisibLights].xy;
if( all(vMa>vTileLL) && all(vMi<vTileUR))
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
if(uIndex<MAX_NR_BIGTILE_LIGHTS) lightsListLDS[uIndex] = l; // add to light list
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
int iNrCoarseLights = min(lightOffs,MAX_NR_BIGTILE_LIGHTS);
SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(64/2,64/2), uint2(iWidth-1, iHeight-1))) );
CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy);
// sort lights
lightOffs = 0;
for(int i=t; i<iNrCoarseLights; i+=NR_THREADS) if(lightsListLDS[i]<g_iNrVisibLights) InterlockedAdd(lightOffs, 1);
iNrCoarseLights = lightOffs;
int offs = tileIDX.y*nrBigTilesX + tileIDX.x;
for(int i=t; i<(iNrCoarseLights+1); i+=NR_THREADS)
g_vLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*offs + i] = t==0 ? iNrCoarseLights : lightsListLDS[i-1];
void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float halfTileSizeAtZDistOne = 32*onePixDiagDist; // scale by half a tile
for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
SFiniteLightBound lgtDat = g_data[lightsListLDS[l]];
if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius) )
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane)
float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
float y = (i&2)==0 ? viTilLL.y : viTilUR.y;
float z = (i&4)==0 ? g_fNearPlane : fTileFarPlane;
z = -z;
return GetViewPosFromLinDepth( float2(x, y), z);
void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane)
int iSection = e0>>2; // section 0 is side edges, section 1 is near edges and section 2 is far edges
int iSwizzle = e0&0x3;
int i=iSwizzle + (2*(iSection&0x2)); // offset by 4 at section 2
vP0 = GetTileVertex(uint2(viTilLL.x, viTilUR.y), uint2(viTilUR.x, viTilLL.y), i, fTileFarPlane);
vE0 = iSection==0 ? vP0 : (((iSwizzle&0x2)==0 ? 1.0f : (-1.0f))*((iSwizzle&0x1)==(iSwizzle>>1) ? float3(1,0,0) : float3(0,1,0)));
void CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR)
const bool bOnlyNeedFrustumSideEdges = true;
const int nrFrustEdges = bOnlyNeedFrustumSideEdges ? 4 : 8; // max 8 since we never need to test 4 far edges of frustum since they are identical vectors to near edges and plane is placed at vP0 on light hull.
const int totNrEdgePairs = 12*nrFrustEdges;
for(int l=0; l<iNrCoarseLights; l++)
const int idxCoarse = lightsListLDS[l];
[branch]if(idxCoarse<(uint) g_iNrVisibLights && g_vLightData[idxCoarse].lightType!=SPHERE_LIGHT) // don't bother doing edge tests for sphere lights since these have camera aligned bboxes.
SFiniteLightBound lgtDat = g_data[idxCoarse];
const float3 boxX = lgtDat.boxAxisX.xyz;
const float3 boxY = lgtDat.boxAxisY.xyz;
const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a spot-light)
const float3 center = lgtDat.center.xyz;
const float2 scaleXY = lgtDat.scaleXY;
for(int i=threadID; i<totNrEdgePairs; i+=NR_THREADS)
int e0 = (int) (((uint)i)/((uint) nrFrustEdges)); // should become a shift right
int e1 = i - e0*nrFrustEdges;
int idx_cur=0, idx_twin=0;
float3 vP0, vE0;
GetHullEdge(idx_cur, idx_twin, vP0, vE0, e0, boxX, boxY, boxZ, center, scaleXY);
float3 vP1, vE1;
GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, g_fFarPlane);
// potential separation plane
float3 vN = cross(vE0, vE1);
int positive=0, negative=0;
for(int k=1; k<8; k++) // only need to test 7 verts (technically just 6).
int j = (idx_cur+k)&0x7;
float3 vPh = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, j);
float fSignDist = idx_twin==j ? 0.0 : dot(vN, vPh-vP0);
if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative;
int resh = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));
positive=0; negative=0;
for(int j=0; j<8; j++)
float3 vPf = GetTileVertex(viTilLL, viTilUR, j, g_fFarPlane);
float fSignDist = dot(vN, vPf-vP0);
if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative;
int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));
bool bFoundSepPlane = (resh*resf)<0;
if(bFoundSepPlane) lightsListLDS[l]=0xffffffff;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)


fileFormatVersion: 2
guid: d190937525dcb3949942a0a93295d2fe
timeCreated: 1479306737
licenseType: Pro
currentAPIMask: 4


#pragma kernel TileLightListGen_NoDepthRT LIGHTLISTGEN=TileLightListGen_NoDepthRT
#pragma kernel TileLightListGen_DepthRT LIGHTLISTGEN=TileLightListGen_DepthRT ENABLE_DEPTH_TEXTURE_BACKPLANE
#pragma kernel TileLightListGen_NoDepthRT_SrcBigTile LIGHTLISTGEN=TileLightListGen_NoDepthRT_SrcBigTile USE_TWO_PASS_TILED_LIGHTING
#pragma kernel ClearAtomic
#include "..\common\ShaderBase.h"
#include "LightDefinitions.cs.hlsl"
#include "LightingConvexHullUtils.hlsl"
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
#include "SortingComputeUtils.hlsl"
uniform int g_iNrVisibLights;
uniform float4x4 g_mInvScrProjection;
uniform float4x4 g_mScrProjection;
uniform float g_fClustScale;
uniform float g_fClustBase;
uniform float g_fNearPlane;
uniform float g_fFarPlane;
uniform int g_iLog2NumClusters; // numClusters = (1<<g_iLog2NumClusters)
#include "ClusteredUtils.h"
Texture2DMS<float> g_depth_tex : register( t0 );
Texture2D g_depth_tex : register( t0 );
StructuredBuffer<float3> g_vBoundsBuffer : register( t1 );
StructuredBuffer<SFiniteLightData> g_vLightData : register( t2 );
StructuredBuffer<SFiniteLightBound> g_data : register( t3 );
Buffer<uint> g_vBigTileLightList : register( t4 );
#define NR_THREADS 64
// output buffer
RWBuffer<uint> g_vLayeredLightList : register( u0 );
RWBuffer<uint> g_LayeredOffset : register( u1 );
RWBuffer<uint> g_LayeredSingleIdxBuffer : register( u2 );
RWBuffer<float> g_logBaseBuffer : register( u3 );
groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];
groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES/2];
groupshared float4 lightPlanes[4*6];
groupshared uint lightOffs;
groupshared int ldsZMax;
groupshared uint ldsIsLightInvisible;
groupshared uint lightOffs2;
groupshared uint lightOffsSph;
float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far
float3 vP = float3(0.0f,0.0f,zDptBufSpace);
float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
return v4Pres.z / v4Pres.w;
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
float fSx = g_mScrProjection[0].x;
float fCx = g_mScrProjection[0].z;
float fSy = g_mScrProjection[1].y;
float fCy = g_mScrProjection[1].z;
return fLinDepth*float3( ((v2ScrPos.x-fCx)/fSx), ((v2ScrPos.y-fCy)/fSy), 1.0 );
return fLinDepth*float3( -((v2ScrPos.x+fCx)/fSx), -((v2ScrPos.y+fCy)/fSy), 1.0 );
float GetOnePixDiagWorldDistAtDepthOne()
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
return length( float2(1.0/fSx,1.0/fSy) );
int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane);
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
// returns 1 for intersection and 0 for none
float4 FetchPlane(int l, int p);
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase)
unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff;
bool bIsHit = ((val>>0)&0xff)<=((uint) k) && ((uint) k)<=((val>>8)&0xff);
float depthAtNearZ = ClusterIdxToZ(k, suggestedBase);
float depthAtFarZ = ClusterIdxToZ(k+1, suggestedBase);
for(int p=0; p<6; p++)
float4 plane = lightPlanes[6*(l&3)+p];
bool bAllInvisib = true;
for(int i=0; i<8; i++)
float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
float y = (i&2)==0 ? viTilLL.y : viTilUR.y;
float z = (i&4)==0 ? depthAtNearZ : depthAtFarZ;
float3 vP = GetViewPosFromLinDepth( float2(x, y), z);
bAllInvisib = bAllInvisib && dot(plane, float4(vP,1.0))>0;
if(bAllInvisib) bIsHit = false;
return bIsHit;
bool CheckIntersectionBasic(int l, int k)
unsigned int val = (clusterIdxs[l>>1]>>(16*(l&1)))&0xffff;
return ((val>>0)&0xff)<=((uint) k) && ((uint) k)<=((val>>8)&0xff);
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
uint iWidth;
uint iHeight;
uint iNumSamplesMSAA;
g_depth_tex.GetDimensions(iWidth, iHeight, iNumSamplesMSAA);
g_depth_tex.GetDimensions(iWidth, iHeight);
uint nrTilesX = (iWidth+15)/16;
uint nrTilesY = (iHeight+15)/16;
uint2 viTilLL = 16*tileIDX;
uint2 viTilUR = min( viTilLL+uint2(16,16), uint2(iWidth, iHeight) ); // not width and height minus 1 since viTilUR represents the end of the tile corner.
lightOffs = 0;
ldsZMax = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
float dpt_ma=1.0;
// establish min and max depth first
for(int idx=t; idx<256; idx+=NR_THREADS)
uint2 uPixCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
for(int i=0; i<iNumSamplesMSAA; i++)
const float fDpth = FetchDepthMSAA(g_depth_tex, uPixCrd, i);
const float fDpth = FetchDepth(g_depth_tex, uPixCrd);
if(fDpth<VIEWPORT_SCALE_Z) // if not skydome
dpt_ma = max(fDpth, dpt_ma);
InterlockedMax(ldsZMax, asuint(dpt_ma) );
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
dpt_ma = asfloat(ldsZMax);
float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, 0.0);
float3 vTileUR = float3(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight, 1.0);
// build coarse list using AABB
int NrBigTilesX = (nrTilesX+3)>>2;
const int bigTileIdx = (tileIDX.y>>2)*NrBigTilesX + (tileIDX.x>>2); // map the idx to 64x64 tiles
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+0];
for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
int l = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+l0+1];
for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
const float3 vMi = g_vBoundsBuffer[l];
const float3 vMa = g_vBoundsBuffer[l+g_iNrVisibLights];
if( all(vMa.xy>vTileLL.xy) && all(vMi.xy<vTileUR.xy))
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
if(uIndex<MAX_NR_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);
iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );
float fTileFarPlane = GetLinearDepth(dpt_ma);
float fTileFarPlane = -GetLinearDepth(dpt_ma);
float suggestedBase = SuggestLogBase50(fTileFarPlane);
float fTileFarPlane = g_fFarPlane;
float suggestedBase = g_fClustBase;
iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane);
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
//////////// cell specific code
for(int l=(int) t; l<((iNrCoarseLights+1)>>1); l += NR_THREADS)
const int l0 = coarseList[2*l+0], l1 = coarseList[min(2*l+1,iNrCoarseLights)];
const unsigned int clustIdxMi0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0].z), suggestedBase));
const unsigned int clustIdxMa0 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l0+g_iNrVisibLights].z), suggestedBase));
const unsigned int clustIdxMi1 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1].z), suggestedBase));
const unsigned int clustIdxMa1 = (const unsigned int) min(255,SnapToClusterIdx(GetLinearDepth(g_vBoundsBuffer[l1+g_iNrVisibLights].z), suggestedBase));
clusterIdxs[l] = (clustIdxMa1<<24) | (clustIdxMi1<<16) | (clustIdxMa0<<8) | (clustIdxMi0<<0);
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
int nrClusters = (1<<g_iLog2NumClusters);
uint start = 0;
int i=(int) t;
int iSpaceAvail = 0;
int iSum = 0;
for(int l=0; l<iNrCoarseLights; l++)
iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint) iSpaceAvail, start); // alloc list memory
int modelListCount[NR_LIGHT_MODELS]={0,0}; // direct light count and reflection lights
uint offs = start;
for(int ll=0; ll<iNrCoarseLights; ll+=4)
int p = i>>2;
int m = i&3;
if(i<24) lightPlanes[6*m+p] = FetchPlane(min(iNrCoarseLights-1,ll+m), p);
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
for(int l=ll; l<min(iNrCoarseLights,(ll+4)); l++)
if(offs<(start+iSpaceAvail) && i<nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase) )
uint lightModel = g_vLightData[ coarseList[l] ].lightModel;
++modelListCount[ lightModel==REFLECTION_LIGHT ? 1 : 0];
g_vLayeredLightList[offs++] = coarseList[l]; // reflection lights will be last since we sorted
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
uint localOffs=0;
offs = i*nrTilesX*nrTilesY + tileIDX.y*nrTilesX + tileIDX.x;
for(int m=0; m<NR_LIGHT_MODELS; m++)
int numLights = min(modelListCount[m],31); // only allow 5 bits
g_LayeredOffset[offs] = (start+localOffs) | (((uint) numLights)<<27);
offs += (nrClusters*nrTilesX*nrTilesY);
localOffs += modelListCount[m]; // use unclamped count for localOffs
g_logBaseBuffer[tileIDX.y*nrTilesX + tileIDX.x] = suggestedBase;
float4 FetchPlane(int l, int p)
SFiniteLightBound lgtDat = g_data[coarseList[l]];
const float3 boxX = lgtDat.boxAxisX.xyz;
const float3 boxY = lgtDat.boxAxisY.xyz;
const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a spot-light)
const float3 center = lgtDat.center.xyz;
const float radius = lgtDat.radius;
const float2 scaleXY = lgtDat.scaleXY;
return GetPlaneEq(boxX, boxY, boxZ, center, scaleXY, p);
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float halfTileSizeAtZDistOne = 8*onePixDiagDist; // scale by half a tile
for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
SFiniteLightBound lgtDat = g_data[coarseList[l]];
if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius) )
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
// too greedy to double-buffer the coarseList LDS for this, so the removal of gaps is serialized.
int offs = 0;
for(int l=0; l<iNrCoarseLights; l++)
{ if(coarseList[l]!=0xffffffff) coarseList[offs++] = coarseList[l]; }
lightOffsSph = offs;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
return lightOffsSph;
float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane)
float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
float y = (i&2)==0 ? viTilLL.y : viTilUR.y;
float z = (i&4)==0 ? g_fNearPlane : fTileFarPlane;
z = -z;
return GetViewPosFromLinDepth( float2(x, y), z);
void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane)
int iSection = e0>>2; // section 0 is side edges, section 1 is near edges and section 2 is far edges
int iSwizzle = e0&0x3;
int i=iSwizzle + (2*(iSection&0x2)); // offset by 4 at section 2
vP0 = GetTileVertex(uint2(viTilLL.x, viTilUR.y), uint2(viTilUR.x, viTilLL.y), i, fTileFarPlane);
vE0 = iSection==0 ? vP0 : (((iSwizzle&0x2)==0 ? 1.0f : (-1.0f))*((iSwizzle&0x1)==(iSwizzle>>1) ? float3(1,0,0) : float3(0,1,0)));
int CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane)
if(threadID==0) lightOffs2 = 0;
const bool bOnlyNeedFrustumSideEdges = true;
const int nrFrustEdges = bOnlyNeedFrustumSideEdges ? 4 : 8; // max 8 since we never need to test 4 far edges of frustum since they are identical vectors to near edges and plane is placed at vP0 on light hull.
const int totNrEdgePairs = 12*nrFrustEdges;
for(int l=0; l<iNrCoarseLights; l++)
if(threadID==0) ldsIsLightInvisible=0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
const int idxCoarse = coarseList[l];
[branch]if(g_vLightData[idxCoarse].lightType!=SPHERE_LIGHT) // don't bother doing edge tests for sphere lights since these have camera aligned bboxes.
SFiniteLightBound lgtDat = g_data[idxCoarse];
const float3 boxX = lgtDat.boxAxisX.xyz;
const float3 boxY = lgtDat.boxAxisY.xyz;
const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a spot-light)
const float3 center = lgtDat.center.xyz;
const float2 scaleXY = lgtDat.scaleXY;
for(int i=threadID; i<totNrEdgePairs; i+=NR_THREADS)
int e0 = (int) (((uint)i)/((uint) nrFrustEdges)); // should become a shift right
int e1 = i - e0*nrFrustEdges;
int idx_cur=0, idx_twin=0;
float3 vP0, vE0;
GetHullEdge(idx_cur, idx_twin, vP0, vE0, e0, boxX, boxY, boxZ, center, scaleXY);
float3 vP1, vE1;
GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, fTileFarPlane);
// potential separation plane
float3 vN = cross(vE0, vE1);
int positive=0, negative=0;
for(int k=1; k<8; k++) // only need to test 7 verts (technically just 6).
int j = (idx_cur+k)&0x7;
float3 vPh = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, j);
float fSignDist = idx_twin==j ? 0.0 : dot(vN, vPh-vP0);
if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative;
int resh = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));
positive=0; negative=0;
for(int j=0; j<8; j++)
float3 vPf = GetTileVertex(viTilLL, viTilUR, j, fTileFarPlane);
float fSignDist = dot(vN, vPf-vP0);
if(fSignDist>0) ++positive; else if(fSignDist<0) ++negative;
int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));
bool bFoundSepPlane = (resh*resf)<0;
if(bFoundSepPlane) InterlockedOr(ldsIsLightInvisible, 1);
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
if(threadID==0 && ldsIsLightInvisible==0)
coarseList[lightOffs2++] = coarseList[l];
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
return lightOffs2;
[numthreads(1, 1, 1)]
void ClearAtomic(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)


fileFormatVersion: 2
guid: a19ed36b92650254397f2a566ed76d36
timeCreated: 1479306737
licenseType: Pro
currentAPIMask: 4


// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
// https://github.com/wolfgangfengel/GPU-Pro-7
#pragma kernel TileLightListGen LIGHTLISTGEN=TileLightListGen
#pragma kernel TileLightListGen_SrcBigTile LIGHTLISTGEN=TileLightListGen_SrcBigTile USE_TWO_PASS_TILED_LIGHTING
#include "..\common\ShaderBase.h"
#include "LightDefinitions.cs.hlsl"
#include "LightingConvexHullUtils.hlsl"
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
#include "SortingComputeUtils.hlsl"
// Inputs, output buffer and group-shared scratch state used by the tile light-list kernel below.

// Number of visible lights; also used as the offset of the "max" half of g_vBoundsBuffer.
uniform int g_iNrVisibLights;
// .x / .y are read as the screen width and height (used to clamp pixel coordinates and count tiles).
uniform uint2 g_viDimensions;
// Used by GetLinearDepth() to turn a depth-buffer value into a linear depth.
uniform float4x4 g_mInvScrProjection;
// Its [0].x, [0].z, [1].y and [1].z entries are read by GetViewPosFromLinDepth().
uniform float4x4 g_mScrProjection;
// Depth buffer, read through FetchDepth().
Texture2D g_depth_tex : register( t0 );
// Per-light bounds: entry [l] is read as the min corner, entry [l+g_iNrVisibLights] as the max corner.
StructuredBuffer<float3> g_vBoundsBuffer : register( t1 );
// Per-light data (lightModel, lightType, flags, position, axes, radiusSq, cotan, box distances are read in this file).
StructuredBuffer<SFiniteLightData> g_vLightData : register( t2 );
// Per-light bounding data; center and radius are read in SphericalIntersectionTests().
StructuredBuffer<SFiniteLightBound> g_data : register( t3 );
// Per big tile: a light count at offset 0 followed by light indices, MAX_NR_BIGTILE_LIGHTS_PLUSONE entries per big tile.
Buffer<uint> g_vBigTileLightList : register( t4 );
// Threads per thread group (see the numthreads attribute on LIGHTLISTGEN).
#define NR_THREADS 64
// output buffer
// Each uint packs two 16-bit values; the low 16 bits of a tile's first uint hold the light count.
RWBuffer<uint> g_vLightList : register( u0 );
// Indices of the lights that passed the coarse bounds test for the current tile.
groupshared unsigned int coarseList[MAX_NR_COARSE_ENTRIES];
groupshared unsigned int prunedList[MAX_NR_COARSE_ENTRIES]; // temporarily support room for all 64 while in LDS
// Tile min/max depth, kept as the uint bit pattern of a float so InterlockedMin/InterlockedMax can be applied.
groupshared uint ldsZMin;
groupshared uint ldsZMax;
// Running count of lights appended to coarseList; readers clamp it to MAX_NR_COARSE_ENTRIES.
groupshared uint lightOffs;
// 64-bit mask split over two uints: bit l is set when coarse light l touches at least one tested pixel.
groupshared uint ldsDoesLightIntersect[2];
// Number of lights appended to prunedList.
groupshared int ldsNrLightsFinal;
groupshared int ldsModelListCount[NR_LIGHT_MODELS]; // since NR_LIGHT_MODELS is 2
// Running count of lights accepted by SphericalIntersectionTests().
groupshared uint lightOffsSph;
//float GetLinearDepth(float3 vP)
// float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
// return v4Pres.z / v4Pres.w;
// Converts a depth-buffer value into a linear depth: the point (0,0,zDptBufSpace,1) is
// transformed by g_mInvScrProjection and the resulting z is divided by the resulting w.
float GetLinearDepth(float zDptBufSpace) // 0 is near 1 is far
float3 vP = float3(0.0f,0.0f,zDptBufSpace);
float4 v4Pres = mul(g_mInvScrProjection, float4(vP,1.0));
return v4Pres.z / v4Pres.w;
// Reconstructs a view-space position from a screen position and a linear depth.
//   v2ScrPos  : screen position (callers in this file pass pixel coordinates).
//   fLinDepth : linear depth used to scale the reconstructed direction.
// The scale (fSx, fSy) and offset (fCx, fCy) terms are taken from g_mScrProjection.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
float fSx = g_mScrProjection[0].x;
float fCx = g_mScrProjection[0].z;
float fSy = g_mScrProjection[1].y;
float fCy = g_mScrProjection[1].z;
// NOTE(review): two consecutive return statements are visible here; presumably they are
// alternatives selected by a preprocessor conditional that is not visible in this excerpt — confirm.
return fLinDepth*float3( ((v2ScrPos.x-fCx)/fSx), ((v2ScrPos.y-fCy)/fSy), 1.0 );
return fLinDepth*float3( -((v2ScrPos.x+fCx)/fSx), -((v2ScrPos.y+fCy)/fSy), 1.0 );
// Returns the length of the vector (1/fSx, 1/fSy), where fSx and fSy are the x/y scale
// entries of g_mScrProjection; per the name, the diagonal extent of one pixel at depth one.
float GetOnePixDiagWorldDistAtDepthOne()
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
return length( float2(1.0/fSx,1.0/fSy) );
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate);
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths);
// Kernel entry point (LIGHTLISTGEN is set per kernel by the #pragma kernel lines of this file).
// One thread group of NR_THREADS threads builds the light list of one 16x16 pixel tile:
//   1. find the tile's min/max depth,
//   2. gather lights whose bounds overlap the tile into coarseList,
//   3. refine the list (sphere test / per-pixel pruning) into prunedList,
//   4. count lights per light model, sort, and write the packed list to g_vLightList.
//   threadID  : index of the thread inside the group.
//   u3GroupID : group id; .xy is the tile coordinate.
// NOTE(review): several "#if !defined(SHADER_API_XBOXONE) ..." lines below have no visible
// guarded statement or #endif in this excerpt; what they guard cannot be determined from here.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
// number of 16x16 tiles needed to cover the screen (rounded up)
uint nrTilesX = (iWidth+15)/16;
uint nrTilesY = (iHeight+15)/16;
// build tile scr boundary
const uint uFltMax = 0x7f7fffff; // FLT_MAX as a uint
ldsZMin = uFltMax;
ldsZMax = 0;
lightOffs = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
// pixel coordinate of the tile's first pixel
uint2 viTilLL = 16*tileIDX;
// establish min and max depth first
float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;
// linear depth of the 4 pixels handled by this thread, reused later by FinePruneLights()
float4 vLinDepths;
// Fetch depths and calculate min/max
// 4 iterations x NR_THREADS threads cover the 256 pixels of the tile:
// idx&0xf is the x offset and idx>>4 the y offset inside the tile.
for(int i = 0; i < 4; i++)
int idx = i * NR_THREADS + t;
// clamp to the last valid pixel so partial tiles at the screen edge stay in range
uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
const float fDepth = FetchDepth(g_depth_tex, uCrd);
vLinDepths[i] = GetLinearDepth(fDepth);
if(fDepth<VIEWPORT_SCALE_Z) // if not skydome
dpt_mi = min(fDepth, dpt_mi);
dpt_ma = max(fDepth, dpt_ma);
// merge this thread's range into the group-wide range (float bits compared as uint)
InterlockedMax(ldsZMax, asuint(dpt_ma));
InterlockedMin(ldsZMin, asuint(dpt_mi));
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
// tile bounds: xy as a fraction of the screen size, z taken from the group-wide depth range
float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));
float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));
vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy;
// build coarse list using AABB
// NOTE(review): two alternative loops follow (one over the big-tile light list, one over all
// visible lights); presumably selected by USE_TWO_PASS_TILED_LIGHTING — the conditional is not visible here.
int NrBigTilesX = (nrTilesX+3)>>2;
const int bigTileIdx = (tileIDX.y>>2)*NrBigTilesX + (tileIDX.x>>2); // map the idx to 64x64 tiles
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+0];
for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
int l = g_vBigTileLightList[MAX_NR_BIGTILE_LIGHTS_PLUSONE*bigTileIdx+l0+1];
for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
const float3 vMi = g_vBoundsBuffer[l];
const float3 vMa = g_vBoundsBuffer[l+g_iNrVisibLights];
// box-vs-box overlap test between the light bounds and the tile bounds
if( all(vMa>vTileLL) && all(vMi<vTileUR))
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
// lightOffs keeps counting past the capacity; only in-range slots are written
if(uIndex<MAX_NR_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
// clear the two words of the per-light intersection mask
if(t<2) ldsDoesLightIntersect[t] = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);
// sphere test against the tile centre pixel (clamped to the screen)
iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );
// NOTE(review): both a plain copy of coarseList into prunedList and a call to FinePruneLights()
// are visible; presumably they are alternatives under a conditional not visible here — confirm.
if((int)t<iNrCoarseLights) prunedList[t] = coarseList[t];
if(t==0) ldsNrLightsFinal=iNrCoarseLights;
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
if(t<NR_LIGHT_MODELS) ldsModelListCount[t]=0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
int nrLightsCombinedList = min(ldsNrLightsFinal,MAX_NR_COARSE_ENTRIES);
// count how many of the surviving lights belong to each light model
for(int i=t; i<nrLightsCombinedList; i+=NR_THREADS)
InterlockedAdd(ldsModelListCount[ g_vLightData[ prunedList[i] ].lightModel ], 1);
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
SORTLIST(prunedList, nrLightsCombinedList, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
// write lights to global buffers
// localOffs: start of the current light model's run inside prunedList
int localOffs=0;
// offs: tile slot inside the current light model's region of g_vLightList
int offs = tileIDX.y*nrTilesX + tileIDX.x;
for(int m=0; m<NR_LIGHT_MODELS; m++)
int nrLightsFinal = ldsModelListCount[ m ];
int nrLightsFinalClamped = nrLightsFinal<MAX_NR_PRUNED_ENTRIES ? nrLightsFinal : MAX_NR_PRUNED_ENTRIES;
// one 16-bit slot for the count plus one per light, rounded up to whole uints
const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;
for(int l=(int) t; l<(int) nrDWords; l += NR_THREADS)
// the low half of the first uint carries the count; every other half carries a light index
uint uLow = l==0 ? nrLightsFinalClamped : prunedList[2*l-1+localOffs];
uint uHigh = prunedList[2*l+0+localOffs];
// each tile owns 16 uints per light model
g_vLightList[16*offs + l] = (uLow&0xffff) | (uHigh<<16);
localOffs += nrLightsFinal;
// advance to the next light model's region (one slot per tile)
offs += (nrTilesX*nrTilesY);
// Filters coarseList with a sphere-vs-tile test and returns the number of lights kept.
//   threadID         : index of the calling thread inside the group.
//   iNrCoarseLights  : number of valid entries in coarseList on entry.
//   screenCoordinate : screen position used to build the direction V passed to the overlap test
//                      (the caller in this file passes the tile centre pixel).
// Accepted entries are written back to the front of coarseList; the count is accumulated
// in lightOffsSph, which is the return value.
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
lightOffsSph = 0;
// make a copy of coarseList in prunedList.
// NOTE(review): the body of this copy loop is not visible in this excerpt.
for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
// NOTE(review): V is declared twice (depth +1.0 and -1.0); presumably alternatives selected
// by a preprocessor conditional that is not visible here — confirm.
float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0);
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float halfTileSizeAtZDistOne = 8*onePixDiagDist; // scale by half a tile
for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
SFiniteLightBound lightData = g_data[coarseList[l]];
if( DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius) )
unsigned int uIndex;
InterlockedAdd(lightOffsSph, 1, uIndex);
coarseList[uIndex]=prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
return lightOffsSph;
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
//
// Per-pixel pruning: every thread tests the 4 tile pixels it owns against each coarse light's
// volume and records a hit as bit l of a 64-bit mask; the masks of all threads are OR-ed
// together and the lights whose bit is set are appended to prunedList.
//   threadID        : index of the calling thread inside the group.
//   iNrCoarseLights : number of valid entries in coarseList.
//   viTilLL         : pixel coordinate of the tile's first pixel.
//   vLinDepths      : linear depth of the 4 pixels owned by this thread.
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths)
uint t = threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
// this thread's share of the 64-bit "light touches the tile" mask
uint uLightsFlags[2] = {0,0};
int l=0;
// need this outer loop even on xb1 and ps4 since direct lights and
// reflection lights are kept in separate regions.
// NOTE(review): the outer loop statement itself is not visible in this excerpt.
// fetch light
int idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uint uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
// spot
// consumes a run of consecutive spot lights
while(l<iNrCoarseLights && uLgtType==SPOT_LIGHT)
SFiniteLightData lightData = g_vLightData[idxCoarse];
const bool bIsSpotDisc = (lightData.flags&IS_CIRCULAR_SPOT_SHAPE)!=0;
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
// same pixel mapping as in the depth fetch: idx&0xf is x, idx>>4 is y inside the tile
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
// view-space position of the pixel centre
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 fromLight = vVPos-lightData.lightPos.xyz;
float distSq = dot(fromLight,fromLight);
const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz
float2 V = abs( float2( dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz) ) );
// circular spots use the radial distance, the others the larger of the two axis distances
float fDist2D = bIsSpotDisc ? length(V) : max(V.x,V.y);
// hit when the pixel is within the light's range and fSclProj exceeds fDist2D*cotan
if( all( float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D*lightData.cotan) ) ) uVal = 1;
// record the result as bit l of the 64-bit mask
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
// sphere
// consumes a run of consecutive sphere lights
while(l<iNrCoarseLights && uLgtType==SPHERE_LIGHT)
SFiniteLightData lightData = g_vLightData[idxCoarse];
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 vLp = lightData.lightPos.xyz;
float3 toLight = vLp - vVPos;
float distSq = dot(toLight,toLight);
// hit when the pixel lies inside the light's radius
if(lightData.radiusSq>distSq) uVal = 1;
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
// Box
// consumes a run of consecutive box lights
while(l<iNrCoarseLights && uLgtType==BOX_LIGHT)
SFiniteLightData lightData = g_vLightData[idxCoarse];
// serially check 4 pixels
uint uVal = 0;
for(int i=0; i<4; i++)
int idx = t + i*NR_THREADS;
uint2 uPixLoc = min(uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5,0.5), vLinDepths[i]);
// check pixel
float3 toLight = lightData.lightPos.xyz - vVPos;
// offset of the pixel from the light expressed along the light's three axes
float3 dist = float3( dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ) );
dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
if( max(max(dist.x, dist.y), dist.z)<1 ) uVal = 1; // but allows us to not write out OuterDists
uLightsFlags[l<32 ? 0 : 1] |= (uVal<<(l&31));
++l; idxCoarse = l<iNrCoarseLights ? coarseList[l] : 0;
uLgtType = l<iNrCoarseLights ? g_vLightData[idxCoarse].lightType : 0;
// in case we have some corrupt data make sure we terminate
if(uLgtType>=MAX_TYPES) ++l;
// merge this thread's mask into the group-wide mask
InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
if(t==0) ldsNrLightsFinal = 0;
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
// thread t owns coarse light t: append it to prunedList when its mask bit is set
if(t<(uint) iNrCoarseLights && (ldsDoesLightIntersect[t<32 ? 0 : 1]&(1<<(t&31)))!=0 )
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
if(uIndex<MAX_NR_COARSE_ENTRIES) prunedList[uIndex] = coarseList[t]; // we allow up to 64 pruned lights while stored in LDS.


fileFormatVersion: 2
guid: 65af3444cbf4b3747a4dead7ee00cfee
timeCreated: 1479306737
licenseType: Pro
currentAPIMask: 4


// The implementation is based on the demo on "fine pruned tiled lighting" published in GPU Pro 7.
// https://github.com/wolfgangfengel/GPU-Pro-7
#pragma kernel ScreenBoundsAABB
#include "..\common\ShaderBase.h"
#include "LightDefinitions.cs.hlsl"
// Inputs, output buffer, group-shared scratch memory and forward declarations of the bounds kernel.

// Number of visible lights; also the offset of the "max" half of g_vBoundsBuffer.
uniform int g_iNrVisibLights;
// Inverse projection: used to bring clip-space frustum corners to view space and passed to CalcBound().
uniform float4x4 g_mInvProjection;
// Projection applied to view-space hull vertices.
uniform float4x4 g_mProjection;
// Per-light bounding data (box axes, center, radius, scaleXY are read by the kernel).
StructuredBuffer<SFiniteLightBound> g_data : register( t0 );
#define FLT_EPSILON 1.192092896e-07F // smallest such that 1.0+FLT_EPSILON != 1.0
// Threads per group; the kernel uses 8 threads per light, i.e. 8 lights per group.
#define NR_THREADS 64
// output buffer
// Entry [lgtIndex] receives the min corner and entry [lgtIndex+g_iNrVisibLights] the max corner.
RWStructuredBuffer<float3> g_vBoundsBuffer : register( u0 );
#define MAX_PNTS 9 // strictly this should be 10=6+4 but we get more wavefronts and 10 seems to never hit (fingers crossed)
// However, worst case the plane that would be skipped if such an extreme case ever happened would be backplane
// clipping gets skipped which doesn't cause any errors.
// LDS (2496 bytes)
// Each of the 8 lights of a group owns MAX_PNTS*2 consecutive entries of posX/Y/Z/W:
// two buffers of MAX_PNTS vertices that the clipper uses alternately as source and destination.
groupshared float posX[MAX_PNTS*8*2];
groupshared float posY[MAX_PNTS*8*2];
groupshared float posZ[MAX_PNTS*8*2];
groupshared float posW[MAX_PNTS*8*2];
// One entry per (light, hull face) pair: 8 lights x 6 faces, four 6-bit clip codes packed per entry.
groupshared unsigned int clipFlags[48];
unsigned int GetClip(const float4 P);
int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p);
void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r);
#include "LightingConvexHullUtils.hlsl"
// Kernel entry point: computes a screen-space axis-aligned bounding box for every visible light
// and writes its min/max corners to g_vBoundsBuffer.
// Each group handles 8 lights with 8 threads per light; 6 of those threads each process one
// face (quad) of the light's convex hull, after which one thread per light combines the results.
//   threadID  : index of the thread inside the group.
//   u3GroupID : group id; only .x is used.
// NOTE(review): braces and some preprocessor/conditional lines are not visible in this excerpt,
// so the exact nesting of the statements below cannot be confirmed from here.
[numthreads(NR_THREADS, 1, 1)]
void ScreenBoundsAABB(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
uint groupID = u3GroupID.x;
//uint vindex = groupID * NR_THREADS + threadID;
unsigned int g = groupID;
unsigned int t = threadID;
// light slot inside the group (0..7), global light index, and hull face / lane index (0..7)
const int subLigt = (int) (t/8);
const int lgtIndex = subLigt+(int) g*8;
const int sideIndex = (int) (t%8);
SFiniteLightBound lgtDat = g_data[lgtIndex];
const float3 boxX = lgtDat.boxAxisX.xyz;
const float3 boxY = lgtDat.boxAxisY.xyz;
const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a spot-light)
const float3 center = lgtDat.center.xyz;
const float radius = lgtDat.radius;
const float2 scaleXY = lgtDat.scaleXY;
if(sideIndex<6 && lgtIndex<(int) g_iNrVisibLights) // mask 2 out of 8 threads
float3 q0, q1, q2, q3;
GetQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, sideIndex);
// the quad's 4 vertices after applying g_mProjection
const float4 vP0 = mul(g_mProjection, float4(q0, 1));
const float4 vP1 = mul(g_mProjection, float4(q1, 1));
const float4 vP2 = mul(g_mProjection, float4(q2, 1));
const float4 vP3 = mul(g_mProjection, float4(q3, 1));
// test vertices of one quad (of the convex hull) for intersection
const unsigned int uFlag0 = GetClip(vP0);
const unsigned int uFlag1 = GetClip(vP1);
const unsigned int uFlag2 = GetClip(vP2);
const unsigned int uFlag3 = GetClip(vP3);
const float4 vPnts[] = {vP0, vP1, vP2, vP3};
// screen-space AABB of one quad (assuming no intersection)
float3 vMin, vMax;
for(int k=0; k<4; k++)
// keep the sign of w but force its magnitude to at least FLT_EPSILON before dividing
float fW = vPnts[k].w;
float fS = fW<0 ? -1 : 1;
float fWabs = fW<0 ? (-fW) : fW;
fW = fS * (fWabs<FLT_EPSILON ? FLT_EPSILON : fWabs);
float3 vP = float3(vPnts[k].x/fW, vPnts[k].y/fW, vPnts[k].z/fW);
if(k==0) { vMin=vP; vMax=vP; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
// pack the four 6-bit clip codes of this face into one word
clipFlags[subLigt*6+sideIndex] = (uFlag0<<0) | (uFlag1<<6) | (uFlag2<<12) | (uFlag3<<18);
// store in clip buffer (only use these vMin and vMax if light is 100% visible in which case clipping isn't needed)
// per-face min goes to slot sideIndex, per-face max to slot sideIndex+6
posX[subLigt*MAX_PNTS*2 + sideIndex] = vMin.x;
posY[subLigt*MAX_PNTS*2 + sideIndex] = vMin.y;
posZ[subLigt*MAX_PNTS*2 + sideIndex] = vMin.z;
posX[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.x;
posY[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.y;
posZ[subLigt*MAX_PNTS*2 + sideIndex + 6] = vMax.z;
// if not XBONE and not PLAYSTATION4 we need a memorybarrier here
// since we can't rely on the gpu cores being 64 wide.
// We need a pound define around this.
int f=0;
// from here on a single thread per light (sideIndex 0) combines the per-face results
if(sideIndex==0 && lgtIndex<(int) g_iNrVisibLights)
// quick acceptance or rejection
// uCollectiveAnd: clip bits set on every hull vertex; uCollectiveOr: clip bits set on any hull vertex
unsigned int uCollectiveAnd = (unsigned int) -1;
unsigned int uCollectiveOr = 0;
for(f=0; f<6; f++)
unsigned int uFlagAnd = clipFlags[subLigt*6+f]&0x3f;
unsigned int uFlagOr = uFlagAnd;
for(int i=1; i<4; i++)
unsigned int uClipBits = (clipFlags[subLigt*6+f]>>(i*6))&0x3f;
uFlagAnd &= uClipBits;
uFlagOr |= uClipBits;
uCollectiveAnd &= uFlagAnd;
uCollectiveOr |= uFlagOr;
bool bSetBoundYet = false;
float3 vMin=0.0, vMax=0.0;
if(uCollectiveAnd!=0 || uCollectiveOr==0) // all invisible or all visible (early out)
if(uCollectiveOr==0) // all visible
// no vertex is clipped: the union of the stored per-face boxes is the bound
for(f=0; f<6; f++)
const int sideIndex = f;
float3 vFaceMi = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 0], posY[subLigt*MAX_PNTS*2 + sideIndex + 0], posZ[subLigt*MAX_PNTS*2 + sideIndex + 0]);
float3 vFaceMa = float3(posX[subLigt*MAX_PNTS*2 + sideIndex + 6], posY[subLigt*MAX_PNTS*2 + sideIndex + 6], posZ[subLigt*MAX_PNTS*2 + sideIndex + 6]);
for(int k=0; k<2; k++)
float3 vP = k==0 ? vFaceMi : vFaceMa;
if(f==0 && k==0) { vMin=vP; vMax=vP; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
else // :( need true clipping
for(f=0; f<6; f++)
float3 q0, q1, q2, q3;
GetQuad(q0, q1, q2, q3, boxX, boxY, boxZ, center, scaleXY, f);
// 4 vertices to a quad of the convex hull in post projection space
const float4 vP0 = mul(g_mProjection, float4(q0, 1));
const float4 vP1 = mul(g_mProjection, float4(q1, 1));
const float4 vP2 = mul(g_mProjection, float4(q2, 1));
const float4 vP3 = mul(g_mProjection, float4(q3, 1));
// iSrcIndex selects which of this light's two MAX_PNTS-sized buffers is the clip source
int iSrcIndex = 0;
int offs = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
// fill up source clip buffer with the quad
posX[offs+0]=vP0.x; posX[offs+1]=vP1.x; posX[offs+2]=vP2.x; posX[offs+3]=vP3.x;
posY[offs+0]=vP0.y; posY[offs+1]=vP1.y; posY[offs+2]=vP2.y; posY[offs+3]=vP3.y;
posZ[offs+0]=vP0.z; posZ[offs+1]=vP1.z; posZ[offs+2]=vP2.z; posZ[offs+3]=vP3.z;
posW[offs+0]=vP0.w; posW[offs+1]=vP1.w; posW[offs+2]=vP2.w; posW[offs+3]=vP3.w;
int iNrSrcVerts = 4;
// do true clipping
// clip against the 6 planes in turn, swapping source and destination buffers each time
for(int p=0; p<6; p++)
const int nrVertsDst = ClipAgainstPlane(iSrcIndex, iNrSrcVerts, subLigt, p);
iSrcIndex = 1-iSrcIndex;
iNrSrcVerts = nrVertsDst;
// stop when fewer than 3 vertices remain or the buffer capacity is reached
if(iNrSrcVerts<3 || iNrSrcVerts>=MAX_PNTS) break;
// final clipped convex primitive is in src buffer
int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
for(int k=0; k<iNrSrcVerts; k++)
float4 vCur = float4(posX[offs_src+k], posY[offs_src+k], posZ[offs_src+k], posW[offs_src+k]);
// project and apply toward AABB
float3 vP = float3(vCur.x/vCur.w, vCur.y/vCur.w, vCur.z/vCur.w);
if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
////////////////////// look for camera frustum verts that need to be included. That is frustum vertices inside the convex hull for the light
int i=0;
for(i=0; i<8; i++) // establish 8 camera frustum vertices
// corner i of the clip volume: x,y in {-1,1}, z in {0,1}
float3 vVertPSpace = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);
float4 v4ViewSpace = mul(g_mInvProjection, float4(vVertPSpace,1));
float3 vViewSpace = float3(v4ViewSpace.x/v4ViewSpace.w, v4ViewSpace.y/v4ViewSpace.w, v4ViewSpace.z/v4ViewSpace.w);
posX[subLigt*MAX_PNTS*2 + i] = vViewSpace.x;
posY[subLigt*MAX_PNTS*2 + i] = vViewSpace.y;
posZ[subLigt*MAX_PNTS*2 + i] = vViewSpace.z;
// determine which camera frustum vertices are inside the convex hull
// bit i stays set only while vertex i gives a negative signed distance for every hull plane
uint uVisibFl = 0xff;
for(f=0; f<6; f++)
float3 vP0, vN;
GetPlane(vP0, vN, boxX, boxY, boxZ, center, scaleXY, f);
for(i=0; i<8; i++)
float3 vViewSpace = float3(posX[subLigt*MAX_PNTS*2 + i], posY[subLigt*MAX_PNTS*2 + i], posZ[subLigt*MAX_PNTS*2 + i]);
uVisibFl &= ( dot(vViewSpace-vP0, vN)<0 ? 0xff : (~(1<<i)) );
// apply camera frustum vertices inside the convex hull to the AABB
// NOTE(review): no test of uVisibFl is visible in this excerpt before the vertex is merged — confirm against the full file.
for(i=0; i<8; i++)
float3 vP = float3((i&1)!=0 ? 1 : (-1), (i&2)!=0 ? 1 : (-1), (i&4)!=0 ? 1 : 0);
if(!bSetBoundYet) { vMin=vP; vMax=vP; bSetBoundYet=true; }
vMax = max(vMax, vP); vMin = min(vMin, vP);
// determine AABB bound in [-1;1]x[-1;1] screen space using bounding sphere.
// Use the result to make our already established AABB from the convex hull
// potentially tighter.
// NOTE(review): the conditions selecting between the off-screen assignments and the sphere
// tightening below are only partly visible in this excerpt.
// set the AABB off-screen
vMin = float3(-3,-3,-3);
vMax = float3(-2,-2,-2);
if( length(center)>radius)
float2 vMi, vMa;
bool2 bMi, bMa;
CalcBound(bMi, bMa, vMi, vMa, g_mInvProjection, center, radius);
// only tighten on the sides that CalcBound reports as valid
vMin.xy = bMi ? max(vMin.xy, vMi) : vMin.xy;
vMax.xy = bMa ? min(vMax.xy, vMa) : vMax.xy;
// NOTE(review): two variants of the z tightening follow (center.z-radius first vs. center.z+radius
// first); presumably alternatives selected by a preprocessor conditional not visible here — confirm.
float4 vPosF = mul(g_mProjection, float4(0,0,center.z-radius,1));
vMin.z = max(vMin.z, vPosF.z/vPosF.w);
float4 vPosB = mul(g_mProjection, float4(0,0,center.z+radius,1));
vMax.z = min(vMax.z, vPosB.z/vPosB.w);
float4 vPosF = mul(g_mProjection, float4(0,0,center.z+radius,1));
vMin.z = max(vMin.z, vPosF.z/vPosF.w);
float4 vPosB = mul(g_mProjection, float4(0,0,center.z-radius,1));
vMax.z = min(vMax.z, vPosB.z/vPosB.w);
vMin = float3(-3,-3,-3);
vMax = float3(-2,-2,-2);
// we should consider doing a look-up here into a max depth mip chain
// to see if the light is occluded: vMin.z*VIEWPORT_SCALE_Z > MipTexelMaxDepth
//g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, -0.5*vMax.y+0.5, vMin.z*VIEWPORT_SCALE_Z);
//g_vBoundsBuffer[lgtIndex+g_iNrVisibLights] = float3(0.5*vMax.x+0.5, -0.5*vMin.y+0.5, vMax.z*VIEWPORT_SCALE_Z);
// changed for unity
// remap x,y with 0.5*v+0.5 (so [-1,1] becomes [0,1]) and scale z by VIEWPORT_SCALE_Z
g_vBoundsBuffer[lgtIndex+0] = float3(0.5*vMin.x+0.5, 0.5*vMin.y+0.5, vMin.z*VIEWPORT_SCALE_Z);
g_vBoundsBuffer[lgtIndex+(int) g_iNrVisibLights] = float3(0.5*vMax.x+0.5, 0.5*vMax.y+0.5, vMax.z*VIEWPORT_SCALE_Z);
float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p);
// Clips the polygon stored in one of a light's two LDS vertex buffers against clip plane p and
// writes the result to the other buffer.
//   iSrcIndex   : 0 or 1, selects the source buffer; the destination is buffer 1-iSrcIndex.
//   iNrSrcVerts : number of vertices in the source polygon.
//   subLigt     : light slot inside the group (selects the light's region of posX/Y/Z/W).
//   p           : plane index; bit p of GetClip() marks a vertex as outside that plane.
// Returns nrVertsDst, the number of vertices written to the destination buffer.
int ClipAgainstPlane(const int iSrcIndex, const int iNrSrcVerts, const int subLigt, const int p)
int offs_src = iSrcIndex*MAX_PNTS+subLigt*MAX_PNTS*2;
int offs_dst = (1-iSrcIndex)*MAX_PNTS+subLigt*MAX_PNTS*2;
// start with the last vertex as "previous" so the closing edge of the polygon is processed too
float4 vPrev = float4(posX[offs_src+(iNrSrcVerts-1)], posY[offs_src+(iNrSrcVerts-1)], posZ[offs_src+(iNrSrcVerts-1)], posW[offs_src+(iNrSrcVerts-1)]);
int nrVertsDst = 0;
unsigned int uMask = (1<<p);
bool bIsPrevVisib = (GetClip(vPrev)&uMask)==0;
for(int i=0; i<iNrSrcVerts; i++)
float4 vCur = float4(posX[offs_src+i], posY[offs_src+i], posZ[offs_src+i], posW[offs_src+i]);
bool bIsCurVisib = (GetClip(vCur)&uMask)==0;
// the edge prev->cur crosses the plane when exactly one of its end points is visible
if( (bIsCurVisib && !bIsPrevVisib) || (!bIsCurVisib && bIsPrevVisib) )
// generate new vertex
float4 vNew = GenNewVert(bIsCurVisib ? vCur : vPrev, bIsCurVisib ? vPrev : vCur, p);
// NOTE(review): the two stores below both target index nrVertsDst; the increments of nrVertsDst
// and the condition guarding the second store are not visible in this excerpt — confirm against the full file.
posX[offs_dst+nrVertsDst]=vNew.x; posY[offs_dst+nrVertsDst]=vNew.y; posZ[offs_dst+nrVertsDst]=vNew.z; posW[offs_dst+nrVertsDst]=vNew.w;
posX[offs_dst+nrVertsDst]=vCur.x; posY[offs_dst+nrVertsDst]=vCur.y; posZ[offs_dst+nrVertsDst]=vCur.z; posW[offs_dst+nrVertsDst]=vCur.w;
vPrev = vCur;
bIsPrevVisib = bIsCurVisib;
return nrVertsDst;
// Returns a 6-bit clip code for the homogeneous point P; a set bit means P is outside that plane:
//   bit 0 (1): x < -w    bit 1 (2):  x > w
//   bit 2 (4): y < -w    bit 3 (8):  y > w
//   bit 4 (16): z < 0    bit 5 (32): z > w
// A result of 0 means none of the six tests failed.
unsigned int GetClip(const float4 P)
//-P.w <= P.x <= P.w
return ((P.x<-P.w)?1:0) | ((P.x>P.w)?2:0) | ((P.y<-P.w)?4:0) | ((P.y>P.w)?8:0) | ((P.z<0)?16:0) | ((P.z>P.w)?32:0);
// Computes the vertex where the segment between a visible and an invisible vertex meets clip plane p.
//   vVisib   : end point that passes plane p.
//   vInvisib : end point that fails plane p.
//   p        : plane index, using the same bit order as GetClip().
// Returns the interpolated homogeneous vertex lying on the plane.
float4 GenNewVert(const float4 vVisib, const float4 vInvisib, const int p)
// the plane is "coordinate = fS*w": fS is 0 for p==4, -1 for the other even planes, +1 for odd planes
const float fS = p==4 ? 0 : ((p&1)==0 ? -1 : 1);
// which coordinate the plane constrains: 0 = x, 1 = y, 2 = z
const int index = ((uint) p)/2;
float x1 = index==0 ? vVisib.x : (index==1 ? vVisib.y : vVisib.z);
float x0 = index==0 ? vInvisib.x : (index==1 ? vInvisib.y : vInvisib.z);
//fS*((vVisib.w-vInvisib.w)*t + vInvisib.w) = (x1-x0)*t + x0;
// fT is the solution t of the equation above (t=0 at vInvisib, t=1 at vVisib)
const float fT = (fS*vInvisib.w-x0)/((x1-x0) - fS*(vVisib.w-vInvisib.w));
float4 vNew = vVisib*fT + vInvisib*(1-fT);
// overwrite the clipped coordinate with fS*w so it lies exactly on the plane
if(index==0) vNew.x = fS*vNew.w;
else if(index==1) vNew.y = fS*vNew.w;
else vNew.z = fS*vNew.w;
return vNew;
// Transforms a plane given as a float4 by multiplying it, as a row vector, with InvProjection.
// CalcBound() uses it to carry view-space planes into post-projection space.
float4 TransformPlaneToPostSpace(float4x4 InvProjection, float4 plane)
return mul(plane, InvProjection);
// From a 2D position and a radius r, computes two 2D vectors packed as (res.x, res.y) and
// (res.z, res.w). Each pair (nx, ny) satisfies nx*posXY.x + ny*posXY.y = -r (see how res.y and
// res.w are derived from res.x and res.z).
// NOTE(review): presumably these are the normals of the two lines through the origin that are
// tangent to the circle of radius r centred at posXY_in — confirm.
// NOTE(review): sqrt(fLenSQ - r*r) is only real when fLenSQ >= r*r; nothing in this function checks that.
float4 EvalPlanePair(float2 posXY_in, float r)
// rotate by 90 degrees to avoid potential division by zero
// (after the optional rotation |posXY.y| >= |posXY.x|, and posXY.y is the divisor below)
bool bMustFlip = abs(posXY_in.y)<abs(posXY_in.x);
float2 posXY = bMustFlip ? float2(-posXY_in.y, posXY_in.x) : posXY_in;
float fLenSQ = dot(posXY, posXY);
float D = posXY.y * sqrt(fLenSQ - r*r);
float4 res;
// first components of the two solutions (the -D and +D roots)
res.x = (-r*posXY.x - D) / fLenSQ;
res.z = (-r*posXY.x + D) / fLenSQ;
// second components follow from nx*posXY.x + ny*posXY.y = -r
res.y = (-r-res.x*posXY.x) / posXY.y;
res.w = (-r-res.z*posXY.x) / posXY.y;
// rotate back by 90 degrees
res = bMustFlip ? float4(res.y, -res.x, res.w, -res.z) : res;
return res;
// Computes a 2D bound for a sphere (view-space centre pos_view_space, radius r) from two plane
// pairs: one built from the (x,z) coordinates and one from the (y,z) coordinates.
//   bIsMinValid / bIsMaxValid : per-axis flags telling the caller whether vMin / vMax may be used.
//   vMin / vMax               : resulting bound per axis (x in .x, y in .y).
//   InvProjection             : matrix handed to TransformPlaneToPostSpace().
void CalcBound(out bool2 bIsMinValid, out bool2 bIsMaxValid, out float2 vMin, out float2 vMax, float4x4 InvProjection, float3 pos_view_space, float r)
float4 planeX = EvalPlanePair(float2(pos_view_space.x, pos_view_space.z), r);
float4 planeY = EvalPlanePair(float2(pos_view_space.y, pos_view_space.z), r);
planeX = planeX.zwxy; // need to swap left/right and top/bottom planes when using left hand system
planeY = planeY.zwxy;
// validity is decided by the sign of the first component of each plane of the pair
bIsMinValid = bool2(planeX.z<0, planeY.z<0);
bIsMaxValid = bool2((-planeX.x)<0, (-planeY.x)<0);
// hopefully the compiler takes zeros into account
// should be the case since the transformation in TransformPlaneToPostSpace()
// is done using multiply-adds and not dot product instructions.
// X planes are expanded to (nx, 0, nz, 0) and Y planes to (0, ny, nz, 0) before transforming
float4 planeX0 = TransformPlaneToPostSpace(InvProjection, float4(planeX.x, 0, planeX.y, 0));
float4 planeX1 = TransformPlaneToPostSpace(InvProjection, float4(planeX.z, 0, planeX.w, 0));
float4 planeY0 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.x, planeY.y, 0));
float4 planeY1 = TransformPlaneToPostSpace(InvProjection, float4(0, planeY.z, planeY.w, 0));
// convert planes to the forms (1,0,0,D) and (0,1,0,D)
// 2D bound is given by -D components
float2 A = -float2(planeX0.w / planeX0.x, planeY0.w / planeY0.y);
float2 B = -float2(planeX1.w / planeX1.x, planeY1.w / planeY1.y);
// Bound is complete
vMin = B;
vMax = A;


fileFormatVersion: 2
guid: 728dce960f8a9c44bbc3abb3b851d8f6
timeCreated: 1479306737
licenseType: Pro
currentAPIMask: 4