907 lines
39 KiB
Plaintext
907 lines
39 KiB
Plaintext
// #pragma enable_d3d11_debug_symbols
|
|
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
|
|
|
|
#pragma kernel main
|
|
|
|
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
|
|
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
|
|
|
|
/* ------------------------------ Inputs ------------------------------------ */
|
|
|
|
// Per-light bounding data. 'main' reads one element per thread, at the index
// returned by GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex).
StructuredBuffer<SFiniteLightBound> g_data : register(t0);
|
|
|
|
/* ------------------------------ Outputs ----------------------------------- */
|
|
|
|
// Output bounds, stored as float4 elements.
// NOTE(review): the code that writes this buffer is not in the visible part of the file -
// confirm the element layout against the end of 'main'.
RWStructuredBuffer<float4> g_vBoundsBuffer : register(u0);
|
|
|
|
/* ------------------------------ Utilities --------------------------------- */
|
|
|
|
// Returns the index of the N-th set bit of 'value', counting set bits from the
// lowest order bit upward (n = 0 selects the lowest set bit).
// If 'value' has 'n' or fewer set bits, returns (uint)-1.
// Slow implementation - do not use for large bit sets.
// Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html
uint NthBitLow(uint value, uint n)
{
    uint bitIndex = ~0u; // All ones, i.e. (uint)-1; consistent with the behavior of firstbitlow()

    // Validate inputs: there must be at least (n + 1) set bits.
    if (n >= countbits(value))
    {
        return bitIndex;
    }

    const uint wanted = n + 1; // Number of set bits that must lie in [0, bitIndex]
    uint remaining    = wanted;

    do
    {
        // Distance from the first unexamined bit to the next set bit.
        const uint skip = firstbitlow(value >> (bitIndex + 1));

        // Optimistic jump: assume that the set bit we just found is followed by
        // (remaining - 1) more set bits with no gaps. This can never overshoot the answer.
        bitIndex += skip + remaining;

        // Count how many bits are actually set in [0, bitIndex].
        const uint found = countbits(value << (32 - (bitIndex + 1)));

        remaining = wanted - found;
    } while (remaining > 0);

    return bitIndex;
}
|
|
|
|
// Returns k_identity4x4 with the XYZ entries of the last column replaced by 'd'.
float4x4 Translation4x4(float3 d)
{
    float4x4 T = k_identity4x4;

    // Last column, rows 1 to 3.
    T._14 = d.x;
    T._24 = d.y;
    T._34 = d.z;

    return T;
}
|
|
|
|
// Scale followed by rotation: the three (scaled) axes become the columns of the result.
float3x3 ScaledRotation3x3(float3 xAxis, float3 yAxis, float3 zAxis)
{
    // The float3x3 constructor fills rows, so transpose to turn the rows into columns.
    return transpose(float3x3(xAxis, yAxis, zAxis));
}
|
|
|
|
// Inverts a 3x3 matrix using cross products of its columns.
// There is no check for a singular matrix: rcp() is applied to the determinant as-is.
float3x3 Invert3x3(float3x3 R)
{
    const float3x3 cols = transpose(R); // Row to column: cols[i] is the i-th column of R

    const float3 c0 = cols[0];
    const float3 c1 = cols[1];
    const float3 c2 = cols[2];

    // This cross product serves both the determinant and the first row of the result.
    const float3 c1xc2 = cross(c1, c2);

    const float det = dot(c0, c1xc2); // Scalar triple product

    // Rows of the unnormalized inverse.
    const float3x3 adjRows = float3x3(c1xc2,
                                      cross(c2, c0),
                                      cross(c0, c1));

    return rcp(det) * adjRows;
}
|
|
|
|
// Embeds a 3x3 matrix into the upper-left corner of a 4x4 matrix.
// The added row and column are (0, 0, 0, 1).
float4x4 Homogenize3x3(float3x3 R)
{
    const float4 row0 = float4(R[0], 0);
    const float4 row1 = float4(R[1], 0);
    const float4 row2 = float4(R[2], 0);
    const float4 row3 = float4(0, 0, 0, 1);

    return float4x4(row0, row1, row2, row3);
}
|
|
|
|
// Builds a perspective projection matrix.
//   a - divisor applied to the X scale (presumably the aspect ratio - TODO confirm with the caller)
//   g - scale applied to Y (the caller passes the distance from the eye to the projection plane)
//   n - distance to the near plane
//   f - distance to the far plane
// After the division by w, Z = n maps to -1 and Z = f maps to +1.
float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
{
    const float rcpDepth = rcp(f - n);

    const float xScale  = g / a;
    const float zScale  = (f + n) * rcpDepth;
    const float zOffset = -2 * f * n * rcpDepth;

    return float4x4(xScale, 0, 0,      0,
                    0,      g, 0,      0,        // No Y-flip
                    0,      0, zScale, zOffset,  // Z in [-1, 1], no Z-reversal
                    0,      0, 1,      0);       // No W-flip
}
|
|
|
|
/* ------------------------------ Implementation ---------------------------- */
|
|
|
|
// !!! IMPORTANT !!!
|
|
// The legacy code from Morten provides us special projection matrices (and their inverses).
|
|
// These matrices are different from the matrices the HDRP uses.
|
|
// There is no reversed-Z buffering (effectively, forced UNITY_REVERSED_Z = 0).
|
|
// Additionally, there is no clip-space flip (effectively, forced UNITY_UV_STARTS_AT_TOP = 0).
|
|
// Therefore, all coordinate systems are left-handed, Y-up, without W-flip.
|
|
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
|
|
// y z
|
|
// | /
|
|
// 0 -- x
|
|
|
|
// Improve the quality of generated code at the expense of readability.
|
|
// Remove when the shader compiler is clever enough to perform this optimization for us.
|
|
// When defined, ClipPolygonAgainstPlane() and UpdateAaBb() advance wrapped ring buffer
// indices incrementally (increment, then reset to 0 at MAX_CLIP_VERTS) instead of
// computing a modulo for every access.
#define OBTUSE_COMPILER
|
|
|
|
#ifdef SHADER_API_XBOXONE
|
|
// The Xbox shader compiler expects the lane swizzle mask to be a compile-time constant.
|
|
// In our case, the mask is a compile-time constant, but it is defined inside a loop
|
|
// that is unrolled at the compile time, and the constants are generated during the
|
|
// constant propagation pass of the optimizer. This works fine on PlayStation, but does not work
|
|
// on Xbox. In order to avoid writing hideous code specifically for Xbox, we disable the support
|
|
// of wave intrinsics on Xbox until the Xbox compiler is fixed.
|
|
// Consequence: the groupshared (LDS) + atomics code paths are compiled instead of LaneSwizzle().
#undef PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
#endif // SHADER_API_XBOXONE
|
|
|
|
|
|
// Reinterprets X as an int and masks off the sign bit. The result is an integer bit pattern.
#define CLEAR_SIGN_BIT(X) (asint(X) & INT_MAX)
|
|
// Integer division rounding up (for non-negative N and positive D).
#define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks
|
|
|
|
// Clipping a plane by a cube may produce a hexagon (6-gon).
|
|
// Clipping a hexagon by 4 planes may produce a decagon (10-gon).
|
|
// Capacity (in vertices) of the per-thread slice of the vertex ring buffer.
#define MAX_CLIP_VERTS (10)
|
|
// Number of vertices of the light volume (a cuboid).
#define NUM_VERTS (8)
|
|
// Number of faces of the light volume.
#define NUM_FACES (6)
|
|
// Number of clipping planes of the view volume.
#define NUM_PLANES (6)
|
|
// Must match the [numthreads] attribute of 'main'.
#define THREADS_PER_GROUP (64)
|
|
// Number of threads cooperating on a single light.
#define THREADS_PER_LIGHT (4) // Set to 1 for debugging
|
|
// 64 / 4 = 16 lights per thread group.
#define LIGHTS_PER_GROUP (THREADS_PER_GROUP / THREADS_PER_LIGHT)
|
|
// 8 * 16 = 128 vertices per thread group.
#define VERTS_PER_GROUP (NUM_VERTS * LIGHTS_PER_GROUP)
|
|
// 8 / 4 = 2 vertices per thread.
#define VERTS_PER_THREAD (NUM_VERTS / THREADS_PER_LIGHT)
|
|
// ceil(6 / 4) = 2 loop iterations per thread; callers check the face index against the actual face count.
#define FACES_PER_THREAD DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)
|
|
|
|
// All planes and faces are always in the standard order (see below).
|
|
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
|
|
// One bit per face. The face index (bit position) is what GetVertexListOfFace() takes as 'f'.
#define FACE_LEFT (1 << 0) // -X
|
|
#define FACE_RIGHT (1 << 1) // +X
|
|
#define FACE_BOTTOM (1 << 2) // -Y
|
|
#define FACE_TOP (1 << 3) // +Y
|
|
#define FACE_FRONT (1 << 4) // -Z
|
|
#define FACE_BACK (1 << 5) // +Z
|
|
// All 6 face bits set (0x3F).
#define FACE_MASK ((1 << NUM_FACES) - 1)
|
|
|
|
// A list of vertices for each face (CCW order w.r.t. its normal, starting from the LSB).
|
|
// Each list packs 4 vertex indices, 3 bits each (12 bits in total):
// the k-th vertex of the face is stored in the bits [3 * k, 3 * k + 2].
#define VERT_LIST_LEFT ((4) << 9 | (6) << 6 | (2) << 3 | (0) << 0)
|
|
#define VERT_LIST_RIGHT ((3) << 9 | (7) << 6 | (5) << 3 | (1) << 0)
|
|
#define VERT_LIST_BOTTOM ((1) << 9 | (5) << 6 | (4) << 3 | (0) << 0)
|
|
#define VERT_LIST_TOP ((6) << 9 | (7) << 6 | (3) << 3 | (2) << 0)
|
|
#define VERT_LIST_FRONT ((2) << 9 | (3) << 6 | (1) << 3 | (0) << 0)
|
|
#define VERT_LIST_BACK ((5) << 9 | (7) << 6 | (6) << 3 | (4) << 0)
|
|
|
|
// All vertices are always in the standard order (see the table below).
// Returns the mask (FACE_* bits) of the three faces that share the vertex 'v' (v in [0, 7]).
uint GetFaceMaskOfVertex(uint v)
{
    // 0: (-1, -1, -1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_FRONT }
    // 1: (+1, -1, -1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_FRONT }
    // 2: (-1, +1, -1) -> { FACE_LEFT  | FACE_TOP    | FACE_FRONT }
    // 3: (+1, +1, -1) -> { FACE_RIGHT | FACE_TOP    | FACE_FRONT }
    // 4: (-1, -1, +1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_BACK  }
    // 5: (+1, -1, +1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_BACK  }
    // 6: (-1, +1, +1) -> { FACE_LEFT  | FACE_TOP    | FACE_BACK  }
    // 7: (+1, +1, +1) -> { FACE_RIGHT | FACE_TOP    | FACE_BACK  }
    // Equivalent to:
    // (((v & 1) == 0) ? 1 : 2) | (((v & 2) == 0) ? 4 : 8) | (((v & 4) == 0) ? 16 : 32)

    // Each bit of 'v' selects the negative (0) or the positive (1) side along one axis.
    const uint xSide = BitFieldExtract(v, 0, 1);
    const uint ySide = BitFieldExtract(v, 1, 1);
    const uint zSide = BitFieldExtract(v, 2, 1);

    // The bit of the positive face directly follows the bit of the negative face,
    // so shifting the negative face by the side bit selects the right one.
    const uint xFace = FACE_LEFT   << xSide;
    const uint yFace = FACE_BOTTOM << ySide;
    const uint zFace = FACE_FRONT  << zSide;

    return xFace | yFace | zFace;
}
|
|
|
|
// Returns the position of the vertex 'v' (v in [0, 7]) of the [-1, 1]^3 cube.
// Bits 0, 1 and 2 of 'v' select the sign of X, Y and Z respectively (set bit -> +1).
float3 GenerateVertexOfStandardCube(uint v)
{
    const float x = ((v & 1) != 0) ? 1 : -1; // FACE_RIGHT : FACE_LEFT
    const float y = ((v & 2) != 0) ? 1 : -1; // FACE_TOP   : FACE_BOTTOM
    const float z = ((v & 4) != 0) ? 1 : -1; // FACE_BACK  : FACE_FRONT

    return float3(x, y, z);
}
|
|
|
|
// Returns the packed vertex list (one of the VERT_LIST_* values: 4 vertex indices, 3 bits each)
// of the face with the index 'f' (f in [0, 5], in the order of the FACE_* bits).
uint GetVertexListOfFace(uint f)
{
    // Warning: don't add 'static' here unless you want really bad code gen.
    // One component per axis: the negative face occupies the low 12 bits,
    // and the positive face occupies the next 12 bits.
    const uint3 packedLists = uint3(VERT_LIST_LEFT   | (VERT_LIST_RIGHT << 12),
                                    VERT_LIST_BOTTOM | (VERT_LIST_TOP   << 12),
                                    VERT_LIST_FRONT  | (VERT_LIST_BACK  << 12));

    const uint axis      = f >> 1;       // 0: X, 1: Y, 2: Z
    const uint bitOffset = 12 * (f & 1); // 0: negative face, 12: positive face

    return BitFieldExtract(packedLists[axis], bitOffset, 12);
}
|
|
|
|
// 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
|
|
// Homogeneous post-perspective coordinates of the vertices of the light volumes,
// stored as separate X/Y/Z/W arrays. The entry of the vertex 'v' of a light is at
// (intraGroupLightIndex * NUM_VERTS + v).
groupshared float gs_HapVertsX[VERTS_PER_GROUP];
|
|
groupshared float gs_HapVertsY[VERTS_PER_GROUP];
|
|
groupshared float gs_HapVertsZ[VERTS_PER_GROUP];
|
|
groupshared float gs_HapVertsW[VERTS_PER_GROUP];
|
|
// Per vertex: bit 'p' is set if the vertex is behind the plane 'p'. Same indexing as above.
groupshared uint gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types)
|
|
|
|
// The arrays below are only needed for the cross-thread reductions performed without wave intrinsics.
#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
// 1 array * 16 elements * 4 bytes each = 64 bytes.
|
|
// Per light: combined with InterlockedOr/InterlockedAnd by the threads of that light.
groupshared uint gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces each (HLSL does not support small data types)
|
|
|
|
// 8 arrays * 16 elements * 4 bytes each = 512 bytes.
|
|
// These are actually floats reinterpreted as uints.
|
|
// The reason is because floating-point atomic operations are not supported.
|
|
groupshared uint gs_NdcAaBbMinPtX[LIGHTS_PER_GROUP];
|
|
groupshared uint gs_NdcAaBbMaxPtX[LIGHTS_PER_GROUP];
|
|
groupshared uint gs_NdcAaBbMinPtY[LIGHTS_PER_GROUP];
|
|
groupshared uint gs_NdcAaBbMaxPtY[LIGHTS_PER_GROUP];
|
|
groupshared uint gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z cannot be trivially reconstructed
|
|
groupshared uint gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique.
|
|
groupshared uint gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
|
|
groupshared uint gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
|
|
#endif // !PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
|
|
|
|
// ----------- Use LDS for the vertex ring buffer, as otherwise FXC generates register spilling.
// 4 arrays * 640 elements * 4 bytes each = 10240 bytes.
// Each thread owns MAX_CLIP_VERTS consecutive entries, starting at (threadIdx * MAX_CLIP_VERTS).
// Access the buffer via GetFromRingBuffer() and WriteToRingBuffer().
|
|
|
|
groupshared float gs_VertexRingBufferX[MAX_CLIP_VERTS * THREADS_PER_GROUP];
|
|
groupshared float gs_VertexRingBufferY[MAX_CLIP_VERTS * THREADS_PER_GROUP];
|
|
groupshared float gs_VertexRingBufferZ[MAX_CLIP_VERTS * THREADS_PER_GROUP];
|
|
groupshared float gs_VertexRingBufferW[MAX_CLIP_VERTS * THREADS_PER_GROUP];
|
|
|
|
// Reads the vertex stored at the position 'entry' of the ring buffer slice owned by the thread 'threadIdx'.
// 'entry' must already be wrapped into [0, MAX_CLIP_VERTS).
float4 GetFromRingBuffer(uint threadIdx, uint entry)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    return float4(gs_VertexRingBufferX[slot],
                  gs_VertexRingBufferY[slot],
                  gs_VertexRingBufferZ[slot],
                  gs_VertexRingBufferW[slot]);
}
|
|
|
|
// Stores 'value' at the position 'entry' of the ring buffer slice owned by the thread 'threadIdx'.
// 'entry' must already be wrapped into [0, MAX_CLIP_VERTS).
void WriteToRingBuffer(uint threadIdx, uint entry, float4 value)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    gs_VertexRingBufferX[slot] = value.x;
    gs_VertexRingBufferY[slot] = value.y;
    gs_VertexRingBufferZ[slot] = value.z;
    gs_VertexRingBufferW[slot] = value.w;
}
|
|
/////////////////////////////////////////////////////////
|
|
|
|
|
|
// Returns 'true' if it manages to cull the face 'f', which is the case
// when all 4 vertices of the face are behind the same plane.
// 'baseOffsetVertex' is the offset of the first vertex of the light in gs_BehindMasksOfVerts.
bool TryCullFace(uint f, uint baseOffsetVertex)
{
    const uint faceVerts = GetVertexListOfFace(f);

    // Start with all the bits set; a bit survives only if every vertex is behind that plane.
    uint sharedBehindMask = FACE_MASK;

    for (uint corner = 0; corner < 4; corner++)
    {
        const uint vertIdx = BitFieldExtract(faceVerts, 3 * corner, 3);

        sharedBehindMask &= gs_BehindMasksOfVerts[baseOffsetVertex + vertIdx];
    }

    // Non-zero if ALL the vertices are behind any of the planes.
    return sharedBehindMask != 0;
}
|
|
|
|
// A polygon vertex paired with its boundary coordinate for the plane currently being clipped against.
struct ClipVertex
{
    float4 pt; // Homogeneous coordinates after perspective
    float  bc; // Boundary coordinate with respect to the plane 'p': (bc >= 0) <-> inside
};
|
|
|
|
// Pairs the homogeneous vertex 'v' with its boundary coordinate for the plane 'p' (p in [0, 5]).
// (p >> 1) selects the coordinate (0: X, 1: Y, 2: Z). Even planes are the planes
// crossing '0' (inside if c >= 0); odd planes are the planes crossing 'w' (inside if c <= w).
ClipVertex CreateClipVertex(uint p, float4 v)
{
    const float coord = v[p >> 1];

    ClipVertex result;

    result.pt = v;

    // dot(PlaneEquation, HapVertex);
    if ((p & 1) == 0)
    {
        result.bc = coord;       // Plane crossing '0'
    }
    else
    {
        result.bc = v.w - coord; // Plane crossing 'w'
    }

    return result;
}
|
|
|
|
// Returns the point where the edge (v0, v1) crosses the plane for which the boundary
// coordinates of both vertices were computed. The caller only passes edges whose
// endpoints lie on opposite sides of the plane.
float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
{
    // Fraction of the way from v0 to v1 at which the boundary coordinate reaches 0.
    const float bcRange = v0.bc - v1.bc;
    const float t       = saturate(v0.bc * rcp(bcRange)); // Guaranteed to lie between 0 and 1

    return lerp(v0.pt, v1.pt, t);
}
|
|
|
|
// Clips a polygon against the plane 'p'.
// The input polygon occupies 'srcSize' consecutive entries of the ring buffer slice of
// the thread 'threadIdx', starting at 'srcBegin'. The output polygon is appended right
// after the input one: 'dstBegin' and 'dstSize' receive its location and vertex count.
// Begin indices are not wrapped; they are reduced modulo MAX_CLIP_VERTS on access only.
void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
                             uint threadIdx,
                             out uint dstBegin, out uint dstSize)
{
    const uint srcEnd = srcBegin + srcSize;

    dstBegin = srcEnd; // Start at the end; we don't use modular arithmetic here
    dstSize  = 0;

    // Begin with the closing edge of the polygon (last vertex -> first vertex).
    ClipVertex prevVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, (srcEnd - 1) % MAX_CLIP_VERTS));

#ifdef OBTUSE_COMPILER
    // Wrapped read and write positions, advanced by hand to avoid a modulo per access.
    uint readSlot  = srcBegin % MAX_CLIP_VERTS;
    uint writeSlot = dstBegin % MAX_CLIP_VERTS;
#endif

    for (uint srcIdx = srcBegin; srcIdx < srcEnd; srcIdx++)
    {
#ifndef OBTUSE_COMPILER
        uint readSlot = srcIdx % MAX_CLIP_VERTS;
#endif
        ClipVertex currVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, readSlot));

        // Execute Blinn's line clipping algorithm.
        // Classify the line segment (prev -> curr). 4 cases:
        // 0. prev out, curr out -> add nothing
        // 1. prev in,  curr out -> add intersection
        // 2. prev out, curr in  -> add intersection, add curr
        // 3. prev in,  curr in  -> add curr
        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of -0.
        const bool prevInside = (prevVert.bc >= 0);
        const bool currInside = (currVert.bc >= 0);

        if (prevInside != currInside)
        {
            // The line segment is guaranteed to cross the plane.
            float4 crossing = IntersectEdgeAgainstPlane(prevVert, currVert);
#ifndef OBTUSE_COMPILER
            uint writeSlot = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
#endif
            WriteToRingBuffer(threadIdx, writeSlot, crossing);
#ifdef OBTUSE_COMPILER
            dstSize++;
            writeSlot++;
            if (writeSlot == MAX_CLIP_VERTS)
            {
                writeSlot = 0;
            }
#endif
        }

        if (currInside)
        {
#ifndef OBTUSE_COMPILER
            uint writeSlot = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
#endif
            WriteToRingBuffer(threadIdx, writeSlot, currVert.pt);
#ifdef OBTUSE_COMPILER
            dstSize++;
            writeSlot++;
            if (writeSlot == MAX_CLIP_VERTS)
            {
                writeSlot = 0;
            }
#endif
        }

#ifdef OBTUSE_COMPILER
        readSlot++;
        if (readSlot == MAX_CLIP_VERTS)
        {
            readSlot = 0;
        }
#endif
        // Keep the vertex in a register: this avoids recomputation, and its ring buffer
        // entry may be overwritten by the output polygon.
        prevVert = currVert;
    }
}
|
|
|
|
// Clips the face 'f' of a light volume against all the planes that at least one of
// its vertices is behind. The vertices of the light start at 'baseVertexOffset' in the
// gs_HapVerts* and gs_BehindMasksOfVerts arrays.
// The resulting polygon is left in the ring buffer slice of the thread 'threadIdx';
// 'srcBegin' and 'srcSize' receive its (unwrapped) first index and its vertex count.
void ClipFaceAgainstViewVolume(uint f, uint baseVertexOffset,
                               out uint srcBegin, out uint srcSize,
                               uint threadIdx)
{
    // The initial polygon is the quad itself, stored at the start of the ring buffer.
    srcBegin = 0;
    srcSize  = 4;

    const uint faceVerts = GetVertexListOfFace(f);

    uint planesToClip = 0; // Initially in front

    for (uint corner = 0; corner < 4; corner++)
    {
        const uint ldsIdx = baseVertexOffset + BitFieldExtract(faceVerts, 3 * corner, 3);

        // Non-zero if ANY of the vertices are behind any of the planes.
        planesToClip |= gs_BehindMasksOfVerts[ldsIdx];

        // Not all edges may require clipping. However, filtering the vertex list
        // is somewhat expensive, so we currently don't do it.
        const float4 hapVert = float4(gs_HapVertsX[ldsIdx],
                                      gs_HapVertsY[ldsIdx],
                                      gs_HapVertsZ[ldsIdx],
                                      gs_HapVertsW[ldsIdx]);

        WriteToRingBuffer(threadIdx, corner, hapVert);
    }

    // Sutherland-Hodgman polygon clipping algorithm.
    // It works by clipping the entire polygon against one clipping plane at a time.
    while (planesToClip != 0)
    {
        const uint p = firstbitlow(planesToClip);

        uint dstBegin, dstSize;
        ClipPolygonAgainstPlane(p, srcBegin, srcSize, threadIdx, dstBegin, dstSize);

        // The output of this pass is the input of the next one.
        srcBegin = dstBegin;
        srcSize  = dstSize;

        planesToClip &= ~(1u << p); // Clear the bit to continue using firstbitlow()
    }
}
|
|
|
|
// Extends the AABB [ndcAaBbMinPt, ndcAaBbMaxPt] with the vertices of the polygon stored
// in the ring buffer slice of the thread 'threadIdx' ('srcSize' vertices starting at 'srcBegin').
// XYZ of the AABB are the coordinates after the division by w, clamped to [0, 1].
// W of the AABB is the W component of the vertex; for the orthographic projection,
// it is replaced by the dot product of the row 2 of 'invProjMat' with the vertex.
void UpdateAaBb(uint srcBegin, uint srcSize, uint threadIdx,
                bool isOrthoProj, float4x4 invProjMat,
                inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
{
    const uint srcEnd = srcBegin + srcSize;

#ifdef OBTUSE_COMPILER
    // Wrapped read position, advanced by hand to avoid a modulo per access.
    uint readSlot = srcBegin % MAX_CLIP_VERTS;
#endif
    for (uint srcIdx = srcBegin; srcIdx < srcEnd; srcIdx++)
    {
#ifndef OBTUSE_COMPILER
        uint readSlot = srcIdx % MAX_CLIP_VERTS;
#endif
        const float4 hapVert = GetFromRingBuffer(threadIdx, readSlot);

        // Clamp to the bounds in case of numerical errors (may still generate -0).
        const float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));

        // For the orthographic projection, we must replace (w = 1).
        const float rbpVertVSz = isOrthoProj ? dot(invProjMat[2], hapVert) : hapVert.w;

        const float4 boundsPt = float4(rapVertNDC, rbpVertVSz);

        ndcAaBbMinPt = min(ndcAaBbMinPt, boundsPt);
        ndcAaBbMaxPt = max(ndcAaBbMaxPt, boundsPt);
#ifdef OBTUSE_COMPILER
        readSlot++;
        if (readSlot == MAX_CLIP_VERTS)
        {
            readSlot = 0;
        }
#endif
    }
}
|
|
|
|
// Given: 'C' is the center of the sphere in the view space, 'r' is its radius;
// 'projScale' and 'projOffset' are used to perform projection of the X (or Y) component of a vector.
// Returns (min, max) of the projected coordinate of the sphere.
// If the tangent planes do not exist, returns the conservative range (-1, 1).
float2 ComputeBoundsOfSphereOnProjectivePlane(float3 C, float r, float projScale, float projOffset)
{
    // See sec. 8.2.1 of https://foundationsofgameenginedev.com/#fged2 for an alternative derivation.
    //
    // Goal: find the planes that pass through the origin O, bound the sphere, and form
    // an axis-aligned rectangle at the intersection with the projection plane.
    //
    // Solution (for the X-coordinate):
    // The intersection of the bounding planes and the projection plane must be vertical lines,
    // which means that the bounding planes must contain the Y-axis.
    // The bounding planes must also be tangent to the sphere.
    // Call the points where the two vertical bounding planes touch the bounding sphere B and D.
    // Assume that B is on the left of C; D is on the right of C.
    // Note that C may be behind the origin, so the same generally goes for B and D.
    // BC is normal w.r.t. the bounding plane, so it is normal w.r.t. the Y-axis; |BC| = r.
    // As a consequence, it lies in a plane parallel to the O-X-Z plane.
    // Consider B'C', which is an orthogonal projection of BC onto the actual O-X-Z plane.
    // (Imagine sliding the sphere up or down between the bounding planes).
    // We then consider a triangle OB'C' that lies entirely in the O-X-Z plane.
    // The coordinates are: OB' = (b.x, 0, b.z), OC' = (c.x, 0, c.z).
    // OBC is a right triangle. So is OB'C'.
    // |BC| = |B'C'| = r.
    // |OB'|^2 = |OC'|^2 - |B'C'|^2.

    // Conservative estimate (we do not cull the bounding sphere using the view frustum).
    // It is kept if the bounding planes tangent to the sphere do not exist (|OB'|^2 <= 0).
    float xMin = -1;
    float xMax =  1;

    const float lenSqOC = dot(C.xz, C.xz);
    const float lenSqOB = lenSqOC - r * r;

    if (lenSqOB > 0)
    {
        const float lenOB = sqrt(lenSqOB);

        // |OB' x OC'| = |OB'| * |OC'| * Sin[a'].
        //  OB' . OC'  = |OB'| * |OC'| * Cos[a'].
        // We can determine Sin[a'] = |B'C'| / |OC'| = r / |OC'|.
        // Cos[a'] = Sqrt[1 - Sin[a']^2].
        // (OB' x OC') points along Y.
        // (OB' x OC').y = b.z * c.x - b.x * c.z.
        // Therefore, b.z * c.x - b.x * c.z = |OB'| * |OC'| * Sin[a'].
        // OB' . OC' = b.x * c.x + b.z * c.z = |OB'| * |OC'| * Cos[a'].
        // Since we don't care about the scale, and |OB'| != 0 and |OC'| != 0,
        // we can equivalently solve
        // z * c.x - x * c.z = |OC'|^3 * Sin[a'].
        // x * c.x + z * c.z = |OC'|^3 * Cos[a'].
        // With 2 equations and 2 unknowns, we can easily solve this linear system.
        // The solution is
        // x = -c.z * r + c.x * |OB'|.
        // z =  c.x * r + c.z * |OB'|.
        const float dirBx = C.x * lenOB - (C.z * r);
        const float dirBz = C.z * lenOB + (C.x * r);

        // (OD' x OC') points along Y.
        // (OD' x OC').y = d.z * c.x - d.x * c.z.
        // We must solve
        // z * c.x - x * c.z = -|OC'|^3 * Sin[a'].
        // x * c.x + z * c.z =  |OC'|^3 * Cos[a'].
        // The solution is
        // x =  c.z * r + c.x * |OB'|.
        // z = -c.x * r + c.z * |OB'|.
        const float dirDx = C.x * lenOB + (C.z * r);
        const float dirDz = C.z * lenOB - (C.x * r);

        // We can transform OB and OD as direction vectors.
        // For the simplification below, see OptimizeProjectionMatrix.
        const float rapBx = (dirBx * rcp(dirBz)) * projScale + projOffset;
        const float rapDx = (dirDx * rcp(dirDz)) * projScale + projOffset;

        // One problem with the above is that this direction may, for certain spheres,
        // point behind the origin (b.z <= 0 or d.z <= 0).
        // At this point we know that the sphere is at least *partially* in front of the origin,
        // and that we are not inside the sphere, so there is at least one valid
        // plane (and one valid direction). We just need the second direction to go "in front"
        // of the first one to extend the bounding box.
        xMin = (dirBz > 0) ? rapBx : -FLT_INF;
        xMax = (dirDz > 0) ? rapDx :  FLT_INF;
    }

    return float2(xMin, xMax);
}
|
|
|
|
//**********************************************************************************************
|
|
// The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
|
|
// The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
|
|
//
|
|
// Since a light volume may be partially off-screen, we must clip it before computing the AABB.
|
|
// Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
|
|
//
|
|
// To avoid having to deal with the "Moebius twist" property of the perspective transform,
|
|
// we perform clipping using the homogeneous (projective) post-perspective coordinates.
|
|
// This clipping method is described in Blinn's paper titled "Line Clipping".
|
|
//
|
|
// The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
|
|
// worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
|
|
// Some faces may require culling rather than clipping (the former is simpler).
|
|
//
|
|
// It's important to realize that face culling may end up culling 5 (or even all 6) faces.
|
|
// This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
|
|
// (Imagine a view volume completely or partially inside a light volume).
|
|
// Therefore, we must perform view-volume-corner-inside-light-volume tests.
|
|
//
|
|
//
|
|
// Notation:
|
|
// rbp - real (3D) coordinates before perspective
|
|
// hbp - hom. (4D) coordinates before perspective
|
|
// hap - hom. (4D) coordinates after perspective
|
|
// rap - real (3D) coordinates after perspective (after division by w)
|
|
// *********************************************************************************************
|
|
|
|
[numthreads(THREADS_PER_GROUP, 1, 1)]
|
|
void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
|
|
{
|
|
const uint t = threadID;
|
|
const uint g = groupID.x;
|
|
const uint eyeIndex = groupID.y; // Currently, can only be 0 or 1
|
|
|
|
const uint intraGroupLightIndex = t / THREADS_PER_LIGHT;
|
|
const uint globalLightIndex = g * LIGHTS_PER_GROUP + intraGroupLightIndex;
|
|
const uint baseVertexOffset = intraGroupLightIndex * NUM_VERTS;
|
|
|
|
const uint eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
|
|
const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset];
|
|
|
|
const float4x4 projMat = g_mProjectionArr[eyeIndex];
|
|
const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex];
|
|
|
|
// Bounding frustum.
|
|
const float3 rbpC = cullData.center.xyz; // View-space
|
|
const float3 rbpX = cullData.boxAxisX.xyz; // Pre-scaled
|
|
const float3 rbpY = cullData.boxAxisY.xyz; // Pre-scaled
|
|
const float3 rbpZ = cullData.boxAxisZ.xyz; // Pre-scaled
|
|
const float scale = cullData.scaleXY; // scale.x = scale.y
|
|
// Bounding sphere.
|
|
const float radius = cullData.radius;
|
|
|
|
#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
// (0) Initialize the TGSM.
|
|
if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
|
|
{
|
|
gs_CullClipFaceMasks[intraGroupLightIndex] = 0; // Initially inside
|
|
gs_NdcAaBbMinPtX[intraGroupLightIndex] = asuint(1.0f);
|
|
gs_NdcAaBbMaxPtX[intraGroupLightIndex] = asuint(0.0f);
|
|
gs_NdcAaBbMinPtY[intraGroupLightIndex] = asuint(1.0f);
|
|
gs_NdcAaBbMaxPtY[intraGroupLightIndex] = asuint(0.0f);
|
|
gs_NdcAaBbMinPtZ[intraGroupLightIndex] = asuint(1.0f);
|
|
gs_NdcAaBbMaxPtZ[intraGroupLightIndex] = asuint(0.0f);
|
|
gs_NdcAaBbMinPtW[intraGroupLightIndex] = asuint(FLT_INF);
|
|
gs_NdcAaBbMaxPtW[intraGroupLightIndex] = asuint(0.0f);
|
|
}
|
|
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
|
|
float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF);
|
|
float4 ndcAaBbMaxPt = 0;
|
|
|
|
// We must determine whether we have to clip or cull any of the faces.
|
|
// If all vertices of a face are inside with respect to all the culling planes,
|
|
// we can trivially accept that face. If all vertices of a face are behind
|
|
// any single plane, we can trivially reject (cull) that face.
|
|
uint cullClipFaceMask = 0; // Initially inside
|
|
|
|
uint i; // Avoid multiply-declared variable warning
|
|
|
|
// (1) Compute the vertices of the light volume.
|
|
for (i = 0; i < VERTS_PER_THREAD; i++)
|
|
{
|
|
uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
|
|
|
|
// rbpVerts[0] = rbpC - rbpX * scale - rbpY * scale - rbpZ; (-s, -s, -1)
|
|
// rbpVerts[1] = rbpC + rbpX * scale - rbpY * scale - rbpZ; (+s, -s, -1)
|
|
// rbpVerts[2] = rbpC - rbpX * scale + rbpY * scale - rbpZ; (-s, +s, -1)
|
|
// rbpVerts[3] = rbpC + rbpX * scale + rbpY * scale - rbpZ; (+s, +s, -1)
|
|
// rbpVerts[4] = rbpC - rbpX - rbpY + rbpZ; (-1, -1, +1)
|
|
// rbpVerts[5] = rbpC + rbpX - rbpY + rbpZ; (+1, -1, +1)
|
|
// rbpVerts[6] = rbpC - rbpX + rbpY + rbpZ; (-1, +1, +1)
|
|
// rbpVerts[7] = rbpC + rbpX + rbpY + rbpZ; (+1, +1, +1)
|
|
|
|
float3 m = GenerateVertexOfStandardCube(v);
|
|
m.xy *= ((v & 4) == 0) ? scale : 1; // X, Y in [-scale, scale]
|
|
|
|
float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
|
|
// Avoid generating (w = 0).
|
|
rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN;
|
|
|
|
float4 hapVert = mul(projMat, float4(rbpVertVS, 1));
|
|
|
|
// Warning: the W component may be negative.
|
|
// Flipping the -W pyramid by negating all coordinates is incorrect
|
|
// and will break both classification and clipping.
|
|
// For the orthographic projection, (w = 1).
|
|
|
|
// Transform the X and Y components: [-w, w] -> [0, w].
|
|
hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);
|
|
|
|
// For each vertex, we must determine whether it is within the bounds.
|
|
// For culling and clipping, we must know, per culling plane, whether the vertex
|
|
// is in the positive or the negative half-space.
|
|
uint behindMask = 0; // Initially in front
|
|
|
|
// Consider the vertex to be inside the view volume if:
|
|
// 0 <= x <= w
|
|
// 0 <= y <= w <-- include boundary points to avoid clipping them later
|
|
// 0 <= z <= w
|
|
// w is always valid
|
|
// TODO: epsilon for numerical robustness?
|
|
|
|
for (uint j = 0; j < (NUM_PLANES / 2); j++)
|
|
{
|
|
float w = hapVert.w;
|
|
|
|
behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
|
|
behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
|
|
}
|
|
|
|
if (behindMask == 0) // Inside?
|
|
{
|
|
// Clamp to the bounds in case of numerical errors (may still generate -0).
|
|
float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
|
|
|
|
ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVS.z));
|
|
ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVS.z));
|
|
}
|
|
else // Outside
|
|
{
|
|
// Mark all the faces of the bounding frustum associated with this vertex.
|
|
cullClipFaceMask |= GetFaceMaskOfVertex(v);
|
|
}
|
|
|
|
gs_HapVertsX[baseVertexOffset + v] = hapVert.x;
|
|
gs_HapVertsY[baseVertexOffset + v] = hapVert.y;
|
|
gs_HapVertsZ[baseVertexOffset + v] = hapVert.z;
|
|
gs_HapVertsW[baseVertexOffset + v] = hapVert.w;
|
|
gs_BehindMasksOfVerts[baseVertexOffset + v] = behindMask;
|
|
}
|
|
|
|
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
|
|
{
|
|
uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
|
|
uint orMask = 0; // Plays no role
|
|
uint xorMask = 1 << i; // Flip bits one by one starting from the LSB
|
|
|
|
cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
|
|
}
|
|
#else
|
|
InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
|
|
#endif
|
|
|
|
// (2) Test the corners of the view volume.
|
|
if (cullClipFaceMask != 0)
|
|
{
|
|
// The light is partially outside the view volume.
|
|
// Therefore, some of the corners of the view volume may be inside the light volume.
|
|
// We perform aggressive culling, so we must make sure they are accounted for.
|
|
// The light volume is a special type of cuboid - a right frustum.
|
|
// We can exploit this fact by building a light-space projection matrix.
|
|
// P_v = T * (R * S) * P_l
|
|
// P_l = (R * S)^{-1} * T^{-1} * P_v
|
|
float4x4 invTranslateToLightSpace = Translation4x4(-rbpC);
|
|
float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(ScaledRotation3x3(rbpX, rbpY, rbpZ)));
|
|
// TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.
|
|
|
|
// This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
|
|
float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace);
|
|
|
|
if (scale != 1) // Perspective light space?
|
|
{
|
|
// Compute the parameters of the perspective projection.
|
|
float s = scale;
|
|
float e = -1 - 2 * (s * rcp(1 - s)); // Signed distance from the origin to the eye
|
|
float n = -e - 1; // Distance from the eye to the near plane
|
|
float f = -e + 1; // Distance from the eye to the far plane
|
|
float g = f; // Distance from the eye to the projection plane
|
|
|
|
float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
|
|
float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f);
|
|
|
|
lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix);
|
|
}
|
|
|
|
for (i = 0; i < VERTS_PER_THREAD; i++)
|
|
{
|
|
uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
|
|
|
|
float3 rapVertCS = GenerateVertexOfStandardCube(v);
|
|
rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1]
|
|
|
|
float4 hbpVertVS = mul(invProjMat, float4(rapVertCS, 1)); // Clip to view space
|
|
float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS); // View to light space
|
|
|
|
// Consider the vertex to be inside the light volume if:
|
|
// -w < x < w
|
|
// -w < y < w <-- exclude boundary points, as we will not clip using these vertices
|
|
// -w < z < w <-- assume that Z-precision is not very important here
|
|
// 0 < w
|
|
// TODO: epsilon for numerical robustness?
|
|
|
|
bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w;
|
|
|
|
if (inside)
|
|
{
|
|
float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
|
|
float rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);
|
|
|
|
ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
|
|
ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
GroupMemoryBarrierWithGroupSync();
|
|
#endif
|
|
|
|
// (3) Cull the faces.
|
|
{
|
|
const uint cullFaceMask = cullClipFaceMask;
|
|
const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
|
|
|
|
for (i = 0; i < FACES_PER_THREAD; i++)
|
|
{
|
|
uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
|
|
|
|
if (n < numFacesToCull)
|
|
{
|
|
uint f = NthBitLow(cullFaceMask, n);
|
|
|
|
if (TryCullFace(f, baseVertexOffset))
|
|
{
|
|
cullClipFaceMask ^= 1 << f; // Clear the bit
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
|
|
{
|
|
uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
|
|
uint orMask = 0; // Plays no role
|
|
uint xorMask = 1 << i; // Flip bits one by one starting from the LSB
|
|
|
|
cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
|
|
}
|
|
#else
|
|
InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
|
|
#endif
|
|
|
|
// (4) Clip the faces.
|
|
{
|
|
const uint clipFaceMask = cullClipFaceMask;
|
|
const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
|
|
|
|
for (i = 0; i < FACES_PER_THREAD; i++)
|
|
{
|
|
uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
|
|
|
|
if (n < numFacesToClip)
|
|
{
|
|
uint f = NthBitLow(clipFaceMask, n);
|
|
|
|
uint srcBegin, srcSize;
|
|
ClipFaceAgainstViewVolume(f, baseVertexOffset,
|
|
srcBegin, srcSize, t);
|
|
UpdateAaBb(srcBegin, srcSize, t, g_isOrthographic != 0, invProjMat,
|
|
ndcAaBbMinPt, ndcAaBbMaxPt);
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
|
|
{
|
|
uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
|
|
uint orMask = 0; // Plays no role
|
|
uint xorMask = 1 << i; // Flip bits one by one starting from the LSB
|
|
|
|
ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask));
|
|
ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask));
|
|
ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask));
|
|
ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, andMask, orMask, xorMask));
|
|
ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, andMask, orMask, xorMask));
|
|
ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, andMask, orMask, xorMask));
|
|
ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, andMask, orMask, xorMask));
|
|
ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, andMask, orMask, xorMask));
|
|
}
|
|
#else
|
|
// Integer comparison works for floating-point numbers as long as the sign bit is 0.
|
|
// We must take care of -0 ourselves. saturate() does not help.
|
|
InterlockedMin(gs_NdcAaBbMinPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
|
|
InterlockedMax(gs_NdcAaBbMaxPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
|
|
InterlockedMin(gs_NdcAaBbMinPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
|
|
InterlockedMax(gs_NdcAaBbMaxPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y)));
|
|
InterlockedMin(gs_NdcAaBbMinPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z)));
|
|
InterlockedMax(gs_NdcAaBbMaxPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z)));
|
|
InterlockedMin(gs_NdcAaBbMinPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w)));
|
|
InterlockedMax(gs_NdcAaBbMaxPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w)));
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[intraGroupLightIndex]);
|
|
ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[intraGroupLightIndex]);
|
|
ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[intraGroupLightIndex]);
|
|
ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[intraGroupLightIndex]);
|
|
ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[intraGroupLightIndex]);
|
|
ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[intraGroupLightIndex]);
|
|
ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[intraGroupLightIndex]);
|
|
ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]);
|
|
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
|
|
|
|
// (5) Compute the AABB of the bounding sphere.
|
|
if (radius > 0)
|
|
{
|
|
// Occasionally, an intersection of AABBs of a bounding sphere and a bounding frustum
|
|
// results in a tighter AABB when compared to using the AABB of the frustum alone.
|
|
// That is the case (mostly) for sphere-capped spot lights with very wide angles.
|
|
// Note that, unfortunately, it is not quite as tight as an AABB of a CSG intersection
|
|
// of a sphere and frustum. Also note that the algorithm below doesn't clip the bounding
|
|
// sphere against the view frustum before computing the bounding box, simply because it is
|
|
// too hard/expensive. I will leave it as a TODO in case someone wants to tackle this problem.
|
|
if ((rbpC.z + radius) > 0) // Is the sphere at least *partially* in front of the origin?
|
|
{
|
|
ndcAaBbMinPt.w = max(ndcAaBbMinPt.w, rbpC.z - radius);
|
|
ndcAaBbMaxPt.w = min(ndcAaBbMaxPt.w, rbpC.z + radius);
|
|
// Computing the 'z' component for an arbitrary projection matrix is hard, so we don't do it.
|
|
// See sec. 8.2.2 of https://foundationsofgameenginedev.com/#fged2 for a solution.
|
|
|
|
float2 rectMin, rectMax;
|
|
|
|
// For the 'x' and 'y' components, the solution is given below.
|
|
if (g_isOrthographic)
|
|
{
|
|
// Compute the center and the extents (half-diagonal) of the bounding box.
|
|
float2 center = mul(projMat, float4(rbpC.xyz, 1)).xy;
|
|
float2 extents = mul(projMat, float4(radius.xx, 0, 0)).xy;
|
|
|
|
rectMin = center - extents;
|
|
rectMax = center + extents;
|
|
}
|
|
else // Perspective
|
|
{
|
|
float2 xBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.xxz, radius, projMat._m00, projMat._m02); // X-Z plane
|
|
float2 yBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.yyz, radius, projMat._m11, projMat._m12); // Y-Z plane
|
|
|
|
rectMin = float2(xBounds.r, yBounds.r);
|
|
rectMax = float2(xBounds.g, yBounds.g);
|
|
}
|
|
|
|
// Transform to the NDC coordinates.
|
|
rectMin = rectMin * 0.5 + 0.5;
|
|
rectMax = rectMax * 0.5 + 0.5;
|
|
|
|
// Note: separating the X- and Y-computations across 2 threads is not worth it.
|
|
ndcAaBbMinPt.xy = max(ndcAaBbMinPt.xy, rectMin);
|
|
ndcAaBbMaxPt.xy = min(ndcAaBbMaxPt.xy, rectMax);
|
|
}
|
|
}
|
|
|
|
if ((globalLightIndex < (uint)g_iNrVisibLights) && (t % THREADS_PER_LIGHT == 0)) // Avoid bank conflicts
|
|
{
|
|
// For stereo, we have two sets of lights. Therefore, each eye has a set of mins
|
|
// followed by a set of maxs, and each set contains g_iNrVisibLights entries.
|
|
const ScreenSpaceBoundsIndices eyeAdjustedOutputOffsets = GenerateScreenSpaceBoundsIndices(globalLightIndex, g_iNrVisibLights, eyeIndex);
|
|
|
|
g_vBoundsBuffer[eyeAdjustedOutputOffsets.min] = ndcAaBbMinPt;
|
|
g_vBoundsBuffer[eyeAdjustedOutputOffsets.max] = ndcAaBbMaxPt;
|
|
}
|
|
}
|