2021-09-09 20:42:29 -04:00

907 lines
39 KiB
Plaintext

// #pragma enable_d3d11_debug_symbols
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
#pragma kernel main
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"
/* ------------------------------ Inputs ------------------------------------ */
// Per-light bounding volumes. The kernel reads the 'center', 'boxAxisX/Y/Z',
// 'scaleXY' and 'radius' fields of each entry.
StructuredBuffer<SFiniteLightBound> g_data : register(t0);
/* ------------------------------ Outputs ----------------------------------- */
// Per-light AABB, written as two float4 values (the min point and the max point).
// XYZ hold the NDC coordinates, W holds the view-space Z coordinate.
RWStructuredBuffer<float4> g_vBoundsBuffer : register(u0);
/* ------------------------------ Utilities --------------------------------- */
// Finds the position of the set bit number 'n' (0-based), counting upward from the lowest order bit.
// Returns 0xFFFFFFFF (consistent with the behavior of firstbitlow()) if 'value' has 'n' or fewer set bits.
// Slow iterative implementation - do not use for large bit sets.
// Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html
uint NthBitLow(uint value, uint n)
{
    const uint notFound = 0xFFFFFFFFu;

    if (n >= countbits(value)) // Validate inputs
    {
        return notFound;
    }

    const uint wanted  = n + 1;    // Number of set bits that must end up inside [0, pos]
    uint       pos     = notFound; // (pos + 1) wraps around to 0 for the first iteration
    uint       missing = wanted;   // Number of set bits still to be found

    do
    {
        // Jump to the next set bit and make a guess: assume that the bits which follow it are all set.
        pos += firstbitlow(value >> (pos + 1)) + missing;
        // Count the bits actually set inside [0, pos] to find out how far off the guess was.
        missing = wanted - countbits(value << (32 - (pos + 1)));
    } while (missing > 0);

    return pos;
}
// Builds a 4x4 matrix that is the identity except for the offset 'd' stored in the last column.
float4x4 Translation4x4(float3 d)
{
    float4x4 T = k_identity4x4;

    // Rows 0..2 of the last column.
    T._m03 = d.x;
    T._m13 = d.y;
    T._m23 = d.z;

    return T;
}
// Scale followed by rotation: builds the matrix whose columns are the given (scaled) axes.
float3x3 ScaledRotation3x3(float3 xAxis, float3 yAxis, float3 zAxis)
{
    // The constructor fills the matrix row by row, so transpose to turn the axes into columns.
    return transpose(float3x3(xAxis, yAxis, zAxis));
}
// Inverts a 3x3 matrix using the cross products of its columns.
// There is no check for a zero determinant.
float3x3 Invert3x3(float3x3 R)
{
    const float3x3 cols = transpose(R); // cols[i] is the i-th column of 'R'

    const float3 c12 = cross(cols[1], cols[2]);
    const float3 c20 = cross(cols[2], cols[0]);
    const float3 c01 = cross(cols[0], cols[1]);

    const float determinant = dot(cols[0], c12);

    // The rows of the result are the cross products scaled by the reciprocal of the determinant.
    return rcp(determinant) * float3x3(c12, c20, c01);
}
// Embeds a 3x3 matrix into the upper-left corner of a 4x4 matrix.
// The remaining entries are 0, except for the bottom-right entry, which is 1.
float4x4 Homogenize3x3(float3x3 R)
{
    return float4x4(R._m00, R._m01, R._m02, 0,
                    R._m10, R._m11, R._m12, 0,
                    R._m20, R._m21, R._m22, 0,
                         0,      0,      0, 1);
}
// Builds a perspective projection matrix that maps Z to [-1, 1] and copies Z into W.
// 'g' scales Y (and X, after division by 'a'); 'n' and 'f' are the near and far plane distances.
// NOTE(review): 'a' is presumably the aspect ratio - the only caller in this file passes 1.
float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
{
    const float invDepthRange = rcp(f - n);

    const float zScale  = (f + n) * invDepthRange;
    const float zOffset = -2 * f * n * invDepthRange;

    const float4 rowX = float4(g/a, 0,      0,       0);
    const float4 rowY = float4(  0, g,      0,       0); // No Y-flip
    const float4 rowZ = float4(  0, 0, zScale, zOffset); // Z in [-1, 1], no Z-reversal
    const float4 rowW = float4(  0, 0,      1,       0); // No W-flip

    return float4x4(rowX, rowY, rowZ, rowW);
}
/* ------------------------------ Implementation ---------------------------- */
// !!! IMPORTANT !!!
// The legacy code from Morten provides us special projection matrices (and their inverses).
// These matrices are different from the matrices the HDRP uses.
// There is no reversed-Z buffering (effectively, forced UNITY_REVERSED_Z = 0).
// Additionally, there is no clip-space flip (effectively, forced UNITY_UV_STARTS_AT_TOP = 0).
// Therefore, all coordinate systems are left-handed, Y-up, without W-flip.
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
// y z
// | /
// 0 -- x
// Improve the quality of generated code at the expense of readability.
// Remove when the shader compiler is clever enough to perform this optimization for us.
// When defined, the ring buffer loops replace the '%' operator with an index that is
// incremented and wrapped around manually.
#define OBTUSE_COMPILER
#ifdef SHADER_API_XBOXONE
// The Xbox shader compiler expects the lane swizzle mask to be a compile-time constant.
// In our case, the mask is a compile-time constant, but it is defined inside a loop
// that is unrolled at the compile time, and the constants are generated during the
// constant propagation pass of the optimizer. This works fine on PlayStation, but does not work
// on Xbox. In order to avoid writing hideous code specifically for Xbox, we disable the support
// of wave intrinsics on Xbox until the Xbox compiler is fixed.
#undef PLATFORM_SUPPORTS_WAVE_INTRINSICS
#endif
// Reinterprets a float as an int and clears its sign bit (used to turn -0 into +0).
#define CLEAR_SIGN_BIT(X) (asint(X) & INT_MAX)
// Integer division that rounds the result up.
#define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks
// Clipping a plane by a cube may produce a hexagon (6-gon).
// Clipping a hexagon by 4 planes may produce a decagon (10-gon).
#define MAX_CLIP_VERTS (10)
#define NUM_VERTS (8)
#define NUM_FACES (6)
#define NUM_PLANES (6)
// Work distribution. With the values below: 16 lights and 128 vertices per group,
// 2 vertices and (at most) 2 faces per thread.
#define THREADS_PER_GROUP (64)
#define THREADS_PER_LIGHT (4) // Set to 1 for debugging
#define LIGHTS_PER_GROUP (THREADS_PER_GROUP / THREADS_PER_LIGHT)
#define VERTS_PER_GROUP (NUM_VERTS * LIGHTS_PER_GROUP)
#define VERTS_PER_THREAD (NUM_VERTS / THREADS_PER_LIGHT)
#define FACES_PER_THREAD DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)
// All planes and faces are always in the standard order (see below).
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
// One bit per face; the bit index is (2 * axis + side), where side 0 is negative and side 1 is positive.
#define FACE_LEFT (1 << 0) // -X
#define FACE_RIGHT (1 << 1) // +X
#define FACE_BOTTOM (1 << 2) // -Y
#define FACE_TOP (1 << 3) // +Y
#define FACE_FRONT (1 << 4) // -Z
#define FACE_BACK (1 << 5) // +Z
#define FACE_MASK ((1 << NUM_FACES) - 1)
// A list of vertices for each face (CCW order w.r.t. its normal, starting from the LSB).
// Each list packs 4 vertex indices using 3 bits per index (12 bits in total).
#define VERT_LIST_LEFT ((4) << 9 | (6) << 6 | (2) << 3 | (0) << 0)
#define VERT_LIST_RIGHT ((3) << 9 | (7) << 6 | (5) << 3 | (1) << 0)
#define VERT_LIST_BOTTOM ((1) << 9 | (5) << 6 | (4) << 3 | (0) << 0)
#define VERT_LIST_TOP ((6) << 9 | (7) << 6 | (3) << 3 | (2) << 0)
#define VERT_LIST_FRONT ((2) << 9 | (3) << 6 | (1) << 3 | (0) << 0)
#define VERT_LIST_BACK ((5) << 9 | (7) << 6 | (6) << 3 | (4) << 0)
// All vertices are always in the standard order (see below).
// Returns the mask of the 3 faces of the standard cube which share the vertex 'v'.
uint GetFaceMaskOfVertex(uint v)
{
    // 0: (-1, -1, -1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_FRONT }
    // 1: (+1, -1, -1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_FRONT }
    // 2: (-1, +1, -1) -> { FACE_LEFT  | FACE_TOP    | FACE_FRONT }
    // 3: (+1, +1, -1) -> { FACE_RIGHT | FACE_TOP    | FACE_FRONT }
    // 4: (-1, -1, +1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_BACK  }
    // 5: (+1, -1, +1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_BACK  }
    // 6: (-1, +1, +1) -> { FACE_LEFT  | FACE_TOP    | FACE_BACK  }
    // 7: (+1, +1, +1) -> { FACE_RIGHT | FACE_TOP    | FACE_BACK  }
    // Equivalent to:
    // (((v & 1) == 0) ? 1 : 2) | (((v & 2) == 0) ? 4 : 8) | (((v & 4) == 0) ? 16 : 32)
    const uint sideX = BitFieldExtract(v, 0, 1); // 0: left,   1: right
    const uint sideY = BitFieldExtract(v, 1, 1); // 0: bottom, 1: top
    const uint sideZ = BitFieldExtract(v, 2, 1); // 0: front,  1: back

    // Shifting the "negative" face bit by the side bit selects the "positive" face if needed.
    uint faces = FACE_LEFT << sideX;
    faces |= FACE_BOTTOM << sideY;
    faces |= FACE_FRONT << sideZ;

    return faces;
}
// Returns the vertex 'v' of the [-1, 1]^3 cube.
// Bits 0, 1 and 2 of 'v' select the sign of the X, Y and Z coordinates, respectively.
float3 GenerateVertexOfStandardCube(uint v)
{
    const float x = ((v & 1) != 0) ? 1 : -1; // FACE_RIGHT : FACE_LEFT
    const float y = ((v & 2) != 0) ? 1 : -1; // FACE_TOP   : FACE_BOTTOM
    const float z = ((v & 4) != 0) ? 1 : -1; // FACE_BACK  : FACE_FRONT

    return float3(x, y, z);
}
// Returns the packed list of the 4 vertices (3 bits each) of the face with the index 'f'.
uint GetVertexListOfFace(uint f)
{
    // Two 12-bit lists are packed per axis: the negative face in the low bits,
    // the positive face in the high bits.
    // Warning: don't add 'static' here unless you want really bad code gen.
    const uint3 packedLists = uint3((VERT_LIST_RIGHT << 12) | VERT_LIST_LEFT,
                                    (VERT_LIST_TOP   << 12) | VERT_LIST_BOTTOM,
                                    (VERT_LIST_BACK  << 12) | VERT_LIST_FRONT);

    const uint axis      = f >> 1;       // 0: X, 1: Y, 2: Z
    const uint bitOffset = 12 * (f & 1); // 0: negative face, 12: positive face

    return BitFieldExtract(packedLists[axis], bitOffset, 12);
}
// 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
// Post-perspective homogeneous vertices of the light volumes, one array per component.
// The vertex 'v' of a light is stored at (intraGroupLightIndex * NUM_VERTS + v).
groupshared float gs_HapVertsX[VERTS_PER_GROUP];
groupshared float gs_HapVertsY[VERTS_PER_GROUP];
groupshared float gs_HapVertsZ[VERTS_PER_GROUP];
groupshared float gs_HapVertsW[VERTS_PER_GROUP];
groupshared uint gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types)
#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
// Without wave intrinsics, the threads of a light exchange their results via the arrays below.
// 1 array * 16 elements * 4 bytes each = 64 bytes.
groupshared uint gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces each (HLSL does not support small data types)
// 8 arrays * 16 elements * 4 bytes each = 512 bytes.
// These are actually floats reinterpreted as uints.
// The reason is because floating-point atomic operations are not supported.
groupshared uint gs_NdcAaBbMinPtX[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMaxPtX[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMinPtY[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMaxPtY[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z cannot be trivially reconstructed
groupshared uint gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique.
groupshared uint gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
groupshared uint gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
// ----------- Use LDS for the vertex ring buffer as otherwise on FXC we create register spilling
// Each thread owns MAX_CLIP_VERTS consecutive entries starting at (threadIdx * MAX_CLIP_VERTS);
// see GetFromRingBuffer() and WriteToRingBuffer().
groupshared float gs_VertexRingBufferX[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferY[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferZ[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferW[MAX_CLIP_VERTS * THREADS_PER_GROUP];
// Loads the vertex number 'entry' from the part of the ring buffer owned by the thread 'threadIdx'.
// 'entry' is used as is, so the caller must wrap it around MAX_CLIP_VERTS beforehand.
float4 GetFromRingBuffer(uint threadIdx, uint entry)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    return float4(gs_VertexRingBufferX[slot],
                  gs_VertexRingBufferY[slot],
                  gs_VertexRingBufferZ[slot],
                  gs_VertexRingBufferW[slot]);
}
// Stores 'value' as the vertex number 'entry' in the part of the ring buffer owned by the thread 'threadIdx'.
// 'entry' is used as is, so the caller must wrap it around MAX_CLIP_VERTS beforehand.
void WriteToRingBuffer(uint threadIdx, uint entry, float4 value)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    gs_VertexRingBufferX[slot] = value.x;
    gs_VertexRingBufferY[slot] = value.y;
    gs_VertexRingBufferZ[slot] = value.z;
    gs_VertexRingBufferW[slot] = value.w;
}
/////////////////////////////////////////////////////////
// Checks whether all 4 vertices of the face 'f' are behind the same plane.
// 'baseOffsetVertex' is the offset of the first vertex of the light in gs_BehindMasksOfVerts.
// Returns 'true' if it manages to cull the face.
bool TryCullFace(uint f, uint baseOffsetVertex)
{
    const uint faceVerts = GetVertexListOfFace(f);

    uint sharedPlanes = FACE_MASK; // Start with all the planes and narrow the set down

    for (uint corner = 0; corner < 4; corner++)
    {
        const uint vertIdx = BitFieldExtract(faceVerts, 3 * corner, 3);

        // Keep only the planes that this vertex is behind, too.
        sharedPlanes &= gs_BehindMasksOfVerts[baseOffsetVertex + vertIdx];
    }

    // Any remaining plane has ALL the vertices of the face behind it.
    return sharedPlanes != 0;
}
// A polygon vertex paired with its classification against the plane currently being clipped.
struct ClipVertex
{
float4 pt; // Homogeneous coordinate after perspective
float bc; // Boundary coordinate with respect to the plane 'p'; (bc >= 0) means inside
};
// Pairs the vertex 'v' with its boundary coordinate for the plane 'p'.
// (p >> 1) selects the coordinate (X, Y or Z). Even planes are the ones crossing '0',
// odd planes are the ones crossing 'w'.
ClipVertex CreateClipVertex(uint p, float4 v)
{
    const float coord = v[p >> 1];

    ClipVertex result;
    result.pt = v;

    // Equivalent to dot(PlaneEquation, HapVertex).
    if ((p & 1) == 0)
    {
        result.bc = coord;       // Inside if (coord >= 0)
    }
    else
    {
        result.bc = v.w - coord; // Inside if (coord <= w)
    }

    return result;
}
// Computes the point where the edge (v0, v1) crosses the plane that the boundary
// coordinates of both vertices refer to, by interpolating between the end points.
float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
{
    const float bcDelta = v0.bc - v1.bc;
    // Guaranteed to lie between 0 and 1 thanks to saturate().
    const float t = saturate(v0.bc * rcp(bcDelta));

    return lerp(v0.pt, v1.pt, t);
}
// Clips the polygon stored in the ring buffer of the thread against the plane 'p'.
// The source polygon occupies the entries [srcBegin, srcBegin + srcSize), taken modulo MAX_CLIP_VERTS.
// The clipped polygon is written right after the source polygon; its (unwrapped) range is
// returned as [dstBegin, dstBegin + dstSize).
// With OBTUSE_COMPILER defined, the wrapped indices are tracked incrementally instead of using '%'.
void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
uint threadIdx,
out uint dstBegin, out uint dstSize)
{
dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
dstSize = 0;
// Start with the last vertex so that the first edge processed is the closing edge (last -> first).
ClipVertex tailVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, (srcBegin + srcSize - 1) % MAX_CLIP_VERTS));
#ifdef OBTUSE_COMPILER
uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
uint modDstIdx = dstBegin % MAX_CLIP_VERTS;
#endif
for (uint j = srcBegin; j < (srcBegin + srcSize); j++)
{
#ifndef OBTUSE_COMPILER
uint modSrcIdx = j % MAX_CLIP_VERTS;
#endif
ClipVertex leadVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, modSrcIdx));
// Execute Blinn's line clipping algorithm.
// Classify the line segment. 4 cases:
// 0. v0 out, v1 out -> add nothing
// 1. v0 in, v1 out -> add intersection
// 2. v0 out, v1 in -> add intersection, add v1
// 3. v0 in, v1 in -> add v1
// (bc >= 0) <-> in, (bc < 0) <-> out. Beware of -0.
if ((tailVert.bc >= 0) != (leadVert.bc >= 0))
{
// The line segment is guaranteed to cross the plane.
float4 clipVert = IntersectEdgeAgainstPlane(tailVert, leadVert);
#ifndef OBTUSE_COMPILER
uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
#endif
WriteToRingBuffer(threadIdx, modDstIdx, clipVert);
#ifdef OBTUSE_COMPILER
// Advance the destination index and wrap it around manually.
dstSize++;
modDstIdx++;
modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
#endif
}
if (leadVert.bc >= 0)
{
#ifndef OBTUSE_COMPILER
uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
#endif
WriteToRingBuffer(threadIdx, modDstIdx, leadVert.pt);
//vertRingBuffer[modDstIdx] = leadVert.pt;
#ifdef OBTUSE_COMPILER
// Advance the destination index and wrap it around manually.
dstSize++;
modDstIdx++;
modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
#endif
}
#ifdef OBTUSE_COMPILER
// Advance the source index and wrap it around manually.
modSrcIdx++;
modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx;
#endif
tailVert = leadVert; // Avoid recomputation and overwriting the vertex in the ring buffer
}
}
// Copies the 4 vertices of the face 'f' into the ring buffer of the thread, and then clips the
// polygon against every plane that at least one of these vertices is behind.
// 'baseVertexOffset' is the offset of the first vertex of the light in the gs_HapVerts* arrays.
// On return, [srcBegin, srcBegin + srcSize) is the (unwrapped) range of ring buffer entries
// that contains the clipped polygon.
void ClipFaceAgainstViewVolume(uint f, uint baseVertexOffset,
                               out uint srcBegin, out uint srcSize,
                               uint threadIdx)
{
    srcBegin = 0;
    srcSize  = 4;

    const uint faceVerts    = GetVertexListOfFace(f);
    uint       planesToClip = 0; // Initially in front of all the planes

    for (uint corner = 0; corner < 4; corner++)
    {
        const uint slot = baseVertexOffset + BitFieldExtract(faceVerts, 3 * corner, 3);

        // A bit is set if ANY of the vertices are behind the corresponding plane.
        planesToClip |= gs_BehindMasksOfVerts[slot];

        // Not all edges may require clipping. However, filtering the vertex list
        // is somewhat expensive, so we currently don't do it.
        const float4 hapVert = float4(gs_HapVertsX[slot],
                                      gs_HapVertsY[slot],
                                      gs_HapVertsZ[slot],
                                      gs_HapVertsW[slot]);

        WriteToRingBuffer(threadIdx, corner, hapVert);
    }

    // Sutherland-Hodgman polygon clipping algorithm.
    // It works by clipping the entire polygon against one clipping plane at a time.
    while (planesToClip != 0)
    {
        const uint plane = firstbitlow(planesToClip);

        uint clippedBegin, clippedSize;
        ClipPolygonAgainstPlane(plane, srcBegin, srcSize, threadIdx, clippedBegin, clippedSize);

        // The output of this pass is the input of the next one.
        srcBegin = clippedBegin;
        srcSize  = clippedSize;

        planesToClip ^= 1 << plane; // Clear the bit to continue using firstbitlow()
    }
}
// Extends the AABB with the vertices of the polygon stored in the ring buffer of the thread.
// The polygon occupies the entries [srcBegin, srcBegin + srcSize), taken modulo MAX_CLIP_VERTS.
// XYZ of the AABB are NDC coordinates; W is the view-space Z coordinate.
void UpdateAaBb(uint srcBegin, uint srcSize, uint threadIdx,
                bool isOrthoProj, float4x4 invProjMat,
                inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
{
    const uint srcEnd = srcBegin + srcSize;

#ifdef OBTUSE_COMPILER
    uint ringIdx = srcBegin % MAX_CLIP_VERTS;
#endif

    for (uint k = srcBegin; k < srcEnd; k++)
    {
    #ifndef OBTUSE_COMPILER
        uint ringIdx = k % MAX_CLIP_VERTS;
    #endif

        const float4 hapVert = GetFromRingBuffer(threadIdx, ringIdx);

        // For the orthographic projection, (w = 1), so the view-space Z must be
        // recovered using the inverse projection matrix instead.
        const float viewZ = isOrthoProj ? dot(invProjMat[2], hapVert) : hapVert.w;

        // Clamp to the bounds in case of numerical errors (may still generate -0).
        const float4 vertBounds = float4(saturate(hapVert.xyz * rcp(hapVert.w)), viewZ);

        ndcAaBbMinPt = min(ndcAaBbMinPt, vertBounds);
        ndcAaBbMaxPt = max(ndcAaBbMaxPt, vertBounds);

    #ifdef OBTUSE_COMPILER
        // Advance the index and wrap it around manually.
        ringIdx++;
        ringIdx = (ringIdx == MAX_CLIP_VERTS) ? 0 : ringIdx;
    #endif
    }
}
// Given: 'C' is the center of the sphere in the view space, 'r' is its radius;
// 'projScale' and 'projOffset' are used to perform projection of the X (or Y) component of a vector.
// Only C.x and C.z are used; to compute the bounds along Y, pass the Y-coordinate in C.x.
// Returns float2(min, max) of the projected sphere. A bound is infinite if the corresponding
// tangent direction points behind the origin. Returns (-1, 1) if the tangent planes do not exist.
float2 ComputeBoundsOfSphereOnProjectivePlane(float3 C, float r, float projScale, float projOffset)
{
float xMin, xMax;
// See sec. 8.2.1 of https://foundationsofgameenginedev.com/#fged2 for an alternative derivation.
// Goal: find the planes that pass through the origin O, bound the sphere, and form
// an axis-aligned rectangle at the intersection with the projection plane.
// Solution (for the X-coordinate):
// The intersection of the bounding planes and the projection plane must be vertical lines,
// which means that the bounding planes must be tangent to the Y-axis.
// The bounding planes must be also tangent to the sphere.
// Call the intersection points of the two vertical bounding planes and the bounding
// sphere B and D. Assume that B is on the left of C; D is on the right of C.
// Note that C may be behind the origin, so the same generally goes for B and D.
// BC is normal w.r.t. the bounding plane, so it is normal w.r.t. the Y-axis; |BC| = r.
// As a consequence, it lies in a plane parallel to the O-X-Z plane.
// Consider B'C', which is an orthogonal projection of BC onto the actual O-X-Z plane.
// (Imagine sliding the sphere up or down between the bounding planes).
// We then consider a triangle OB'C' that lies entirely in the O-X-Z plane.
// The coordinates are: OB' = (b.x, 0, b.z), OC' = (c.x, 0, c.z).
float3 B, D;
// OBC is a right triangle. So is OB'C'.
// |BC| = |B'C'| = r.
// |OB'|^2 = |OC'|^2 - |B'C'|^2.
float lenSqOC = dot(C.xz, C.xz);
float lenSqOB = lenSqOC - r * r;
// If |OB'| = 0 or |OC'| = 0, the bounding planes tangent to the sphere do not exist.
if (lenSqOB > 0)
{
float lenOB = sqrt(lenSqOB);
// |OB' x OC'| = |OB'| * |OC'| * Sin[a'].
// OB' . OC' = |OB'| * |OC'| * Cos[a'].
// We can determine Sin[a'] = |B'C'| / |OC'| = r / |OC'|.
// Cos[a'] = Sqrt[1 - Sin[a']^2].
// (OB' x OC') points along Y.
// (OB' x OC').y = b.z * c.x - b.x * c.z.
// Therefore, b.z * c.x - b.x * c.z = |OB'| * |OC'| * Sin[a'].
// OB' . OC' = b.x * c.x + b.z * c.z = |OB'| * |OC'| * Cos[a'].
// Since we don't care about the scale, and |OB'| != 0 and |OC'| != 0,
// we can equivalently solve
// z * c.x - x * c.z = |OC'|^3 * Sin[a'].
// x * c.x + z * c.z = |OC'|^3 * Cos[a'].
// With 2 equations and 2 unknowns, we can easily solve this linear system.
// The solution is
// x = -c.z * r + c.x * |OB'|.
// z = c.x * r + c.z * |OB'|.
B.x = C.x * lenOB - (C.z * r);
B.z = C.z * lenOB + (C.x * r);
// (OD' x OC') points along Y.
// (OD' x OC').y = d.z * c.x - d.x * c.z.
// We must solve
// z * c.x - x * c.z = -|OC'|^3 * Sin[a'].
// x * c.x + z * c.z = |OC'|^3 * Cos[a'].
// The solution is
// x = c.z * r + c.x * |OB'|.
// z = -c.x * r + c.z * |OB'|.
D.x = C.x * lenOB + (C.z * r);
D.z = C.z * lenOB - (C.x * r);
// We can transform OB and OD as direction vectors.
// For the simplification below, see OptimizeProjectionMatrix.
float rapBx = (B.x * rcp(B.z)) * projScale + projOffset;
float rapDx = (D.x * rcp(D.z)) * projScale + projOffset;
// One problem with the above is that this direction may, for certain spheres,
// point behind the origin (B.z <= 0 or D.z <= 0).
// At this point we know that the sphere is at least *partially* in front of the origin,
// and that we are not inside the sphere, so there is at least one valid
// plane (and one valid direction). We just need the second direction to go "in front"
// of the first one to extend the bounding box.
xMin = (B.z > 0) ? rapBx : -FLT_INF;
xMax = (D.z > 0) ? rapDx : FLT_INF;
}
else
{
// Conservative estimate (we do not cull the bounding sphere using the view frustum).
xMin = -1;
xMax = 1;
}
return float2(xMin, xMax);
}
//**********************************************************************************************
// The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
// The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
//
// Since a light volume may be partially off-screen, we must clip it before computing the AABB.
// Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
//
// To avoid having to deal with the "Moebius twist" property of the perspective transform,
// we perform clipping using the homogeneous (projective) post-perspective coordinates.
// This clipping method is described in Blinn's paper titled "Line Clipping".
//
// The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
// worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
// Some faces may require culling rather than clipping (the former is simpler).
//
// It's important to realize that face culling may end up culling 5 (or even all 6) faces.
// This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
// (Imagine a view volume completely or partially inside a light volume).
// Therefore, we must perform view-volume-corner-inside-light-volume tests.
//
//
// Notation:
// rbp - real (3D) coordinates before perspective
// hbp - hom. (4D) coordinates before perspective
// hap - hom. (4D) coordinates after perspective
// rap - real (3D) coordinates after perspective (after division by w)
// *********************************************************************************************
// Kernel entry point. Each thread group processes LIGHTS_PER_GROUP lights; each light is
// processed cooperatively by THREADS_PER_LIGHT consecutive threads.
// threadID - flattened index of the thread within the group.
// groupID - 'x' selects the batch of lights, 'y' selects the eye.
// Output: the first thread of each light writes the min and the max points of the AABB
// (NDC coordinates in XYZ, view-space Z in W) to g_vBoundsBuffer.
// Stages: (0) initialize the TGSM, (1) transform and classify the vertices of the light volume,
// (2) test the corners of the view volume against the light volume, (3) cull the faces,
// (4) clip the remaining faces, (5) intersect the result with the AABB of the bounding sphere.
[numthreads(THREADS_PER_GROUP, 1, 1)]
void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
{
const uint t = threadID;
const uint g = groupID.x;
const uint eyeIndex = groupID.y; // Currently, can only be 0 or 1
const uint intraGroupLightIndex = t / THREADS_PER_LIGHT;
const uint globalLightIndex = g * LIGHTS_PER_GROUP + intraGroupLightIndex;
const uint baseVertexOffset = intraGroupLightIndex * NUM_VERTS;
const uint eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
// NOTE(review): lights with (globalLightIndex >= g_iNrVisibLights) are still processed; only the
// final write is guarded. Presumably this keeps every thread reaching the group barriers below
// and relies on out-of-range buffer reads being harmless - confirm.
const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset];
const float4x4 projMat = g_mProjectionArr[eyeIndex];
const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex];
// Bounding frustum.
const float3 rbpC = cullData.center.xyz; // View-space
const float3 rbpX = cullData.boxAxisX.xyz; // Pre-scaled
const float3 rbpY = cullData.boxAxisY.xyz; // Pre-scaled
const float3 rbpZ = cullData.boxAxisZ.xyz; // Pre-scaled
const float scale = cullData.scaleXY; // scale.x = scale.y
// Bounding sphere.
const float radius = cullData.radius;
#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
// (0) Initialize the TGSM.
// The min-max values start as an empty (inverted) range.
// NOTE(review): no barrier separates this initialization from the atomic operations on the same
// entries below; this appears to assume that the THREADS_PER_LIGHT threads which share a light
// execute in lockstep - confirm.
if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
{
gs_CullClipFaceMasks[intraGroupLightIndex] = 0; // Initially inside
gs_NdcAaBbMinPtX[intraGroupLightIndex] = asuint(1.0f);
gs_NdcAaBbMaxPtX[intraGroupLightIndex] = asuint(0.0f);
gs_NdcAaBbMinPtY[intraGroupLightIndex] = asuint(1.0f);
gs_NdcAaBbMaxPtY[intraGroupLightIndex] = asuint(0.0f);
gs_NdcAaBbMinPtZ[intraGroupLightIndex] = asuint(1.0f);
gs_NdcAaBbMaxPtZ[intraGroupLightIndex] = asuint(0.0f);
gs_NdcAaBbMinPtW[intraGroupLightIndex] = asuint(FLT_INF);
gs_NdcAaBbMaxPtW[intraGroupLightIndex] = asuint(0.0f);
}
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
// Per-thread AABB, which also starts as an empty (inverted) range.
float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF);
float4 ndcAaBbMaxPt = 0;
// We must determine whether we have to clip or cull any of the faces.
// If all vertices of a face are inside with respect to all the culling planes,
// we can trivially accept that face. If all vertices of a face are behind
// any single plane, we can trivially reject (cull) that face.
uint cullClipFaceMask = 0; // Initially inside
uint i; // Avoid multiply-declared variable warning
// (1) Compute the vertices of the light volume.
for (i = 0; i < VERTS_PER_THREAD; i++)
{
// The vertices are interleaved across the threads of the light.
uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
// rbpVerts[0] = rbpC - rbpX * scale - rbpY * scale - rbpZ; (-s, -s, -1)
// rbpVerts[1] = rbpC + rbpX * scale - rbpY * scale - rbpZ; (+s, -s, -1)
// rbpVerts[2] = rbpC - rbpX * scale + rbpY * scale - rbpZ; (-s, +s, -1)
// rbpVerts[3] = rbpC + rbpX * scale + rbpY * scale - rbpZ; (+s, +s, -1)
// rbpVerts[4] = rbpC - rbpX - rbpY + rbpZ; (-1, -1, +1)
// rbpVerts[5] = rbpC + rbpX - rbpY + rbpZ; (+1, -1, +1)
// rbpVerts[6] = rbpC - rbpX + rbpY + rbpZ; (-1, +1, +1)
// rbpVerts[7] = rbpC + rbpX + rbpY + rbpZ; (+1, +1, +1)
float3 m = GenerateVertexOfStandardCube(v);
m.xy *= ((v & 4) == 0) ? scale : 1; // X, Y in [-scale, scale]
float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
// Avoid generating (w = 0).
rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN;
float4 hapVert = mul(projMat, float4(rbpVertVS, 1));
// Warning: the W component may be negative.
// Flipping the -W pyramid by negating all coordinates is incorrect
// and will break both classification and clipping.
// For the orthographic projection, (w = 1).
// Transform the X and Y components: [-w, w] -> [0, w].
hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);
// For each vertex, we must determine whether it is within the bounds.
// For culling and clipping, we must know, per culling plane, whether the vertex
// is in the positive or the negative half-space.
uint behindMask = 0; // Initially in front
// Consider the vertex to be inside the view volume if:
// 0 <= x <= w
// 0 <= y <= w <-- include boundary points to avoid clipping them later
// 0 <= z <= w
// w is always valid
// TODO: epsilon for numerical robustness?
for (uint j = 0; j < (NUM_PLANES / 2); j++)
{
float w = hapVert.w;
behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
}
if (behindMask == 0) // Inside?
{
// Clamp to the bounds in case of numerical errors (may still generate -0).
float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVS.z));
ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVS.z));
}
else // Outside
{
// Mark all the faces of the bounding frustum associated with this vertex.
cullClipFaceMask |= GetFaceMaskOfVertex(v);
}
// Publish the vertex and its classification for the face culling and clipping stages.
gs_HapVertsX[baseVertexOffset + v] = hapVert.x;
gs_HapVertsY[baseVertexOffset + v] = hapVert.y;
gs_HapVertsZ[baseVertexOffset + v] = hapVert.z;
gs_HapVertsW[baseVertexOffset + v] = hapVert.w;
gs_BehindMasksOfVerts[baseVertexOffset + v] = behindMask;
}
// Combine (OR) the face masks computed by the threads which share the light.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
{
uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
uint orMask = 0; // Plays no role
uint xorMask = 1 << i; // Flip bits one by one starting from the LSB
cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
}
#else
InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
GroupMemoryBarrierWithGroupSync();
cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
#endif
// (2) Test the corners of the view volume.
if (cullClipFaceMask != 0)
{
// The light is partially outside the view volume.
// Therefore, some of the corners of the view volume may be inside the light volume.
// We perform aggressive culling, so we must make sure they are accounted for.
// The light volume is a special type of cuboid - a right frustum.
// We can exploit this fact by building a light-space projection matrix.
// P_v = T * (R * S) * P_l
// P_l = (R * S)^{-1} * T^{-1} * P_v
float4x4 invTranslateToLightSpace = Translation4x4(-rbpC);
float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(ScaledRotation3x3(rbpX, rbpY, rbpZ)));
// TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.
// This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace);
if (scale != 1) // Perspective light space?
{
// Compute the parameters of the perspective projection.
float s = scale;
float e = -1 - 2 * (s * rcp(1 - s)); // Signed distance from the origin to the eye
float n = -e - 1; // Distance from the eye to the near plane
float f = -e + 1; // Distance from the eye to the far plane
float g = f; // Distance from the eye to the projection plane
float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f);
lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix);
}
// The 8 corners of the view volume are interleaved across the threads of the light.
for (i = 0; i < VERTS_PER_THREAD; i++)
{
uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
float3 rapVertCS = GenerateVertexOfStandardCube(v);
rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1]
float4 hbpVertVS = mul(invProjMat, float4(rapVertCS, 1)); // Clip to view space
float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS); // View to light space
// Consider the vertex to be inside the light volume if:
// -w < x < w
// -w < y < w <-- exclude boundary points, as we will not clip using these vertices
// -w < z < w <-- assume that Z-precision is not very important here
// 0 < w
// TODO: epsilon for numerical robustness?
bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w;
if (inside)
{
float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
float rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);
ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));
}
}
}
// The wave path has not executed a group barrier yet. The stages below read the vertices
// and the masks written to the TGSM by other threads in step (1).
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
GroupMemoryBarrierWithGroupSync();
#endif
// (3) Cull the faces.
{
const uint cullFaceMask = cullClipFaceMask;
const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
// The marked faces are interleaved across the threads of the light.
for (i = 0; i < FACES_PER_THREAD; i++)
{
uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
if (n < numFacesToCull)
{
uint f = NthBitLow(cullFaceMask, n);
if (TryCullFace(f, baseVertexOffset))
{
cullClipFaceMask ^= 1 << f; // Clear the bit
}
}
}
}
// Combine (AND) the masks: a face remains marked only if no thread has cleared its bit.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
{
uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
uint orMask = 0; // Plays no role
uint xorMask = 1 << i; // Flip bits one by one starting from the LSB
cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
}
#else
InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
GroupMemoryBarrierWithGroupSync();
cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
#endif
// (4) Clip the faces.
{
const uint clipFaceMask = cullClipFaceMask;
const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
// The remaining faces are interleaved across the threads of the light.
for (i = 0; i < FACES_PER_THREAD; i++)
{
uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
if (n < numFacesToClip)
{
uint f = NthBitLow(clipFaceMask, n);
uint srcBegin, srcSize;
ClipFaceAgainstViewVolume(f, baseVertexOffset,
srcBegin, srcSize, t);
UpdateAaBb(srcBegin, srcSize, t, g_isOrthographic != 0, invProjMat,
ndcAaBbMinPt, ndcAaBbMaxPt);
}
}
}
// Combine (min-max) the AABBs computed by the threads which share the light.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
{
uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
uint orMask = 0; // Plays no role
uint xorMask = 1 << i; // Flip bits one by one starting from the LSB
ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask));
ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask));
ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask));
ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, andMask, orMask, xorMask));
ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, andMask, orMask, xorMask));
ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, andMask, orMask, xorMask));
ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, andMask, orMask, xorMask));
ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, andMask, orMask, xorMask));
}
#else
// Integer comparison works for floating-point numbers as long as the sign bit is 0.
// We must take care of -0 ourselves. saturate() does not help.
InterlockedMin(gs_NdcAaBbMinPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
InterlockedMax(gs_NdcAaBbMaxPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
InterlockedMin(gs_NdcAaBbMinPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
InterlockedMax(gs_NdcAaBbMaxPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y)));
InterlockedMin(gs_NdcAaBbMinPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z)));
InterlockedMax(gs_NdcAaBbMaxPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z)));
InterlockedMin(gs_NdcAaBbMinPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w)));
InterlockedMax(gs_NdcAaBbMaxPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w)));
GroupMemoryBarrierWithGroupSync();
ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[intraGroupLightIndex]);
ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[intraGroupLightIndex]);
ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[intraGroupLightIndex]);
ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[intraGroupLightIndex]);
ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[intraGroupLightIndex]);
ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[intraGroupLightIndex]);
ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[intraGroupLightIndex]);
ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]);
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS
// (5) Compute the AABB of the bounding sphere.
if (radius > 0)
{
// Occasionally, an intersection of AABBs of a bounding sphere and a bounding frustum
// results in a tighter AABB when compared to using the AABB of the frustum alone.
// That is the case (mostly) for sphere-capped spot lights with very wide angles.
// Note that, unfortunately, it is not quite as tight as an AABB of a CSG intersection
// of a sphere and frustum. Also note that the algorithm below doesn't clip the bounding
// sphere against the view frustum before computing the bounding box, simply because it is
// too hard/expensive. I will leave it as a TODO in case someone wants to tackle this problem.
if ((rbpC.z + radius) > 0) // Is the sphere at least *partially* in front of the origin?
{
ndcAaBbMinPt.w = max(ndcAaBbMinPt.w, rbpC.z - radius);
ndcAaBbMaxPt.w = min(ndcAaBbMaxPt.w, rbpC.z + radius);
// Computing the 'z' component for an arbitrary projection matrix is hard, so we don't do it.
// See sec. 8.2.2 of https://foundationsofgameenginedev.com/#fged2 for a solution.
float2 rectMin, rectMax;
// For the 'x' and 'y' components, the solution is given below.
if (g_isOrthographic)
{
// Compute the center and the extents (half-diagonal) of the bounding box.
float2 center = mul(projMat, float4(rbpC.xyz, 1)).xy;
float2 extents = mul(projMat, float4(radius.xx, 0, 0)).xy;
rectMin = center - extents;
rectMax = center + extents;
}
else // Perspective
{
// The function only reads the 'x' and 'z' components of its first argument.
float2 xBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.xxz, radius, projMat._m00, projMat._m02); // X-Z plane
float2 yBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.yyz, radius, projMat._m11, projMat._m12); // Y-Z plane
rectMin = float2(xBounds.r, yBounds.r);
rectMax = float2(xBounds.g, yBounds.g);
}
// Transform to the NDC coordinates.
rectMin = rectMin * 0.5 + 0.5;
rectMax = rectMax * 0.5 + 0.5;
// Note: separating the X- and Y-computations across 2 threads is not worth it.
ndcAaBbMinPt.xy = max(ndcAaBbMinPt.xy, rectMin);
ndcAaBbMaxPt.xy = min(ndcAaBbMaxPt.xy, rectMax);
}
}
// Only the first thread of each valid light writes the result.
if ((globalLightIndex < (uint)g_iNrVisibLights) && (t % THREADS_PER_LIGHT == 0)) // Avoid bank conflicts
{
// For stereo, we have two sets of lights. Therefore, each eye has a set of mins
// followed by a set of maxs, and each set is equal to g_iNrVisibLights.
const ScreenSpaceBoundsIndices eyeAdjustedOutputOffsets = GenerateScreenSpaceBoundsIndices(globalLightIndex, g_iNrVisibLights, eyeIndex);
g_vBoundsBuffer[eyeAdjustedOutputOffsets.min] = ndcAaBbMinPt;
g_vBoundsBuffer[eyeAdjustedOutputOffsets.max] = ndcAaBbMaxPt;
}
}