Arcade-Maniac/Source/Library/PackageCache/com.unity.render-pipelines.high-definition@11.0.0/Runtime/Lighting/LightLoop/scrbound.compute

// #pragma enable_d3d11_debug_symbols
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

#pragma kernel main

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"

/* ------------------------------ Inputs ------------------------------------ */

// Per-light culling data. main() fetches one SFiniteLightBound per (light, eye) pair,
// using the index returned by GenerateLightCullDataIndex().
StructuredBuffer<SFiniteLightBound> g_data : register(t0);

/* ------------------------------ Outputs ----------------------------------- */

// NOTE(review): this buffer is not written in the portion of the kernel reviewed here;
// presumably it receives the final per-light bounds - confirm against the end of main().
RWStructuredBuffer<float4> g_vBoundsBuffer : register(u0);

/* ------------------------------ Utilities --------------------------------- */

// Finds the bit index of the N-th set bit of 'value' (n = 0 selects the lowest set bit),
// scanning from the least significant bit upward.
// Returns -1 (all bits set, as a uint) if 'value' has fewer than (n + 1) set bits,
// which is consistent with the behavior of firstbitlow().
// Slow implementation - do not use for large bit sets.
// Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html
uint NthBitLow(uint value, uint n)
{
    uint pos = -1; // Index of the last bit examined so far; wraps to 0 on (pos + 1)

    if (n >= countbits(value)) // Not enough set bits
    {
        return pos;
    }

    const uint wanted  = n + 1;  // Total number of set bits that [0, pos] must contain
    uint       missing = wanted; // How many of them have not been found yet

    do
    {
        // Skip the run of zeros above 'pos' to land on the next set bit.
        uint skip = firstbitlow(value >> (pos + 1));
        // Optimistic jump: pretend the next 'missing' bits starting there are all set.
        // This can never overshoot the real answer.
        pos += skip + missing;
        // Count how many bits in [0, pos] are actually set.
        uint found = countbits(value << (32 - (pos + 1)));
        missing = wanted - found;
    } while (missing > 0);

    return pos;
}

// Builds a 4x4 matrix that translates by 'd': identity with 'd' in the first three rows
// of the last column.
float4x4 Translation4x4(float3 d)
{
    return float4x4(1, 0, 0, d.x,
                    0, 1, 0, d.y,
                    0, 0, 1, d.z,
                    0, 0, 0, 1);
}

// Scale followed by rotation: builds the 3x3 matrix whose COLUMNS are the given
// (pre-scaled) axes.
float3x3 ScaledRotation3x3(float3 xAxis, float3 yAxis, float3 zAxis)
{
    // The float3x3 constructor fills rows, so transpose to turn the axes into columns.
    return transpose(float3x3(xAxis, yAxis, zAxis));
}

// Inverts a 3x3 matrix using cross products of its columns.
// No check is made for a singular matrix (det == 0).
float3x3 Invert3x3(float3x3 R)
{
    const float3x3 columns = transpose(R); // Rows of the transpose are the columns of R
    const float3   c0      = columns[0];
    const float3   c1      = columns[1];
    const float3   c2      = columns[2];

    const float3 c1xc2 = cross(c1, c2);
    const float  det   = dot(c0, c1xc2); // Scalar triple product

    // Row i of the inverse is the cross product of the other two columns, divided by 'det'.
    const float3x3 adjugate = float3x3(c1xc2,
                                       cross(c2, c0),
                                       cross(c0, c1));
    return rcp(det) * adjugate;
}

// Embeds a 3x3 matrix into the upper-left corner of a 4x4 matrix.
// The extra row and column are those of the identity matrix.
float4x4 Homogenize3x3(float3x3 R)
{
    const float4 row0 = float4(R[0], 0);
    const float4 row1 = float4(R[1], 0);
    const float4 row2 = float4(R[2], 0);
    const float4 row3 = float4(0, 0, 0, 1);

    return float4x4(row0, row1, row2, row3);
}

// Builds a perspective projection matrix.
// a - value that the X scale is divided by (g/a)
// g - scale applied to Y (and to X before the division by 'a')
// n - distance to the near plane
// f - distance to the far plane
// The result has no Y-flip, no Z-reversal and no W-flip; Z is mapped to [-1, 1].
float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
{
    const float invDepthRange = rcp(f - n);

    const float zScale  = (f + n) * invDepthRange;
    const float zOffset = -2 * f * n * invDepthRange;

    const float4 rowX = float4(g/a, 0,      0,       0);
    const float4 rowY = float4(  0, g,      0,       0); // No Y-flip
    const float4 rowZ = float4(  0, 0, zScale, zOffset); // Z in [-1, 1], no Z-reversal
    const float4 rowW = float4(  0, 0,      1,       0); // No W-flip

    return float4x4(rowX, rowY, rowZ, rowW);
}

/* ------------------------------ Implementation ---------------------------- */

// !!! IMPORTANT !!!
// The legacy code from Morten provides us special projection matrices (and their inverses).
// These matrices are different from the matrices the HDRP uses.
// There is no reversed-Z buffering (effectively, forced UNITY_REVERSED_Z = 0).
// Additionally, there is no clip-space flip (effectively, forced UNITY_UV_STARTS_AT_TOP = 0).
// Therefore, all coordinate systems are left-handed, Y-up, without W-flip.
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
// y  z
// | /
// 0 -- x

// Improve the quality of generated code at the expense of readability.
// When defined, the loops that walk the vertex ring buffer keep wrapped indices that are
// incremented (and reset to 0) by hand instead of evaluating '% MAX_CLIP_VERTS' per iteration.
// Remove when the shader compiler is clever enough to perform this optimization for us.
#define OBTUSE_COMPILER

#ifdef SHADER_API_XBOXONE
// The Xbox shader compiler expects the lane swizzle mask to be a compile-time constant.
// In our case, the mask is a compile-time constant, but it is defined inside a loop
// that is unrolled at compile time, and the constants are generated during the
// constant propagation pass of the optimizer. This works fine on PlayStation, but does not work
// on Xbox. In order to avoid writing hideous code specifically for Xbox, we disable the support
// of wave intrinsics on Xbox until the Xbox compiler is fixed.
#undef PLATFORM_SUPPORTS_WAVE_INTRINSICS
#endif


// Reinterprets the bits of X as a signed integer and masks them with INT_MAX.
// NOTE(review): the result is an integer bit pattern, not a float; INT_MAX is defined
// outside this file (presumably 0x7FFFFFFF) - confirm before use.
#define CLEAR_SIGN_BIT(X)  (asint(X) & INT_MAX)
// Integer division of N by D, rounded up.
#define DIV_ROUND_UP(N, D) (((N) + (D) - 1) / (D)) // No division by 0 checks

// Clipping a plane by a cube may produce a hexagon (6-gon).
// Clipping a hexagon by 4 planes may produce a decagon (10-gon).
#define MAX_CLIP_VERTS    (10)
#define NUM_VERTS         (8)  // Vertices of the light volume
#define NUM_FACES         (6)  // Faces of the light volume
#define NUM_PLANES        (6)  // Clipping planes of the view volume
#define THREADS_PER_GROUP (64)
#define THREADS_PER_LIGHT (4) // Set to 1 for debugging
// With the values above: 16 lights and 128 vertices per group,
// 2 vertices and (at most) 2 faces per thread.
#define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
#define VERTS_PER_GROUP   (NUM_VERTS * LIGHTS_PER_GROUP)
#define VERTS_PER_THREAD  (NUM_VERTS / THREADS_PER_LIGHT)
#define FACES_PER_THREAD  DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)

// All planes and faces are always in the standard order (see below).
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
// One bit per face: bit 'i' of a face mask corresponds to face index 'i'
// (0 = left, 1 = right, 2 = bottom, 3 = top, 4 = front, 5 = back).
#define FACE_LEFT   (1 << 0) // -X
#define FACE_RIGHT  (1 << 1) // +X
#define FACE_BOTTOM (1 << 2) // -Y
#define FACE_TOP    (1 << 3) // +Y
#define FACE_FRONT  (1 << 4) // -Z
#define FACE_BACK   (1 << 5) // +Z
#define FACE_MASK   ((1 << NUM_FACES) - 1) // All face bits set

// A list of vertices for each face (CCW order w.r.t. its normal, starting from the LSB).
// Each list packs four 3-bit vertex indices into 12 bits; the indices follow the
// vertex numbering documented in GetFaceMaskOfVertex().
#define VERT_LIST_LEFT   ((4) << 9 | (6) << 6 | (2) << 3 | (0) << 0)
#define VERT_LIST_RIGHT  ((3) << 9 | (7) << 6 | (5) << 3 | (1) << 0)
#define VERT_LIST_BOTTOM ((1) << 9 | (5) << 6 | (4) << 3 | (0) << 0)
#define VERT_LIST_TOP    ((6) << 9 | (7) << 6 | (3) << 3 | (2) << 0)
#define VERT_LIST_FRONT  ((2) << 9 | (3) << 6 | (1) << 3 | (0) << 0)
#define VERT_LIST_BACK   ((5) << 9 | (7) << 6 | (6) << 3 | (4) << 0)

// Returns the mask of the 3 faces of the standard cube that share the vertex 'v'.
// All vertices are always in the standard order:
// 0: (-1, -1, -1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_FRONT }
// 1: (+1, -1, -1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_FRONT }
// 2: (-1, +1, -1) -> { FACE_LEFT  | FACE_TOP    | FACE_FRONT }
// 3: (+1, +1, -1) -> { FACE_RIGHT | FACE_TOP    | FACE_FRONT }
// 4: (-1, -1, +1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_BACK  }
// 5: (+1, -1, +1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_BACK  }
// 6: (-1, +1, +1) -> { FACE_LEFT  | FACE_TOP    | FACE_BACK  }
// 7: (+1, +1, +1) -> { FACE_RIGHT | FACE_TOP    | FACE_BACK  }
uint GetFaceMaskOfVertex(uint v)
{
    // Bit k of 'v' selects the positive side of axis k. Within each pair of faces the
    // positive face is the negative face shifted left by 1, so this is equivalent to
    // (((v & 1) == 0) ? 1 : 2) | (((v & 2) == 0) ? 4 : 8) | (((v & 4) == 0) ? 16 : 32).
    const uint sideX = (v >> 0) & 1;
    const uint sideY = (v >> 1) & 1;
    const uint sideZ = (v >> 2) & 1;

    uint faces = FACE_LEFT << sideX;
    faces |= FACE_BOTTOM << sideY;
    faces |= FACE_FRONT  << sideZ;

    return faces;
}

// Returns the position of the vertex 'v' of the [-1, 1]^3 cube.
// Bit 0 of 'v' selects the X side, bit 1 the Y side, bit 2 the Z side
// (bit cleared -> -1, bit set -> +1).
float3 GenerateVertexOfStandardCube(uint v)
{
    const float x = ((v & 1) != 0) ? 1 : -1; // FACE_RIGHT : FACE_LEFT
    const float y = ((v & 2) != 0) ? 1 : -1; // FACE_TOP   : FACE_BOTTOM
    const float z = ((v & 4) != 0) ? 1 : -1; // FACE_BACK  : FACE_FRONT

    return float3(x, y, z);
}

// Returns the packed list of the 4 vertex indices (3 bits each, 12 bits total) of the
// face with index 'f' (0 = left, 1 = right, 2 = bottom, 3 = top, 4 = front, 5 = back).
uint GetVertexListOfFace(uint f)
{
    // Each component stores the lists of the two opposite faces of one axis:
    // the negative face in bits [0, 11], the positive face in bits [12, 23].
    // Warning: don't add 'static' here unless you want really bad code gen.
    const uint3 packedFacePairs = uint3(VERT_LIST_LEFT   | (VERT_LIST_RIGHT << 12),
                                        VERT_LIST_BOTTOM | (VERT_LIST_TOP   << 12),
                                        VERT_LIST_FRONT  | (VERT_LIST_BACK  << 12));

    const uint axis      = f >> 1;       // 0 = X, 1 = Y, 2 = Z
    const uint bitOffset = (f & 1) * 12; // 0 for the negative face, 12 for the positive one

    return BitFieldExtract(packedFacePairs[axis], bitOffset, 12);
}

// Per-vertex data of every light processed by the group, indexed by
// (intraGroupLightIndex * NUM_VERTS + v). Written in step (1) of main().
// 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
groupshared float gs_HapVertsX[VERTS_PER_GROUP];
groupshared float gs_HapVertsY[VERTS_PER_GROUP];
groupshared float gs_HapVertsZ[VERTS_PER_GROUP];
groupshared float gs_HapVertsW[VERTS_PER_GROUP];
groupshared uint  gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types)

#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
// Per-light data (indexed by intraGroupLightIndex), only needed when the threads
// of a light cannot exchange values via wave intrinsics.
// 1 array *  16 elements * 4 bytes each = 64 bytes.
groupshared uint  gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces  each (HLSL does not support small data types)

// 8 arrays * 16 elements * 4 bytes each = 512 bytes.
// These are actually floats reinterpreted as uints.
// The reason is because floating-point atomic operations are not supported.
groupshared uint  gs_NdcAaBbMinPtX[LIGHTS_PER_GROUP];
groupshared uint  gs_NdcAaBbMaxPtX[LIGHTS_PER_GROUP];
groupshared uint  gs_NdcAaBbMinPtY[LIGHTS_PER_GROUP];
groupshared uint  gs_NdcAaBbMaxPtY[LIGHTS_PER_GROUP];
groupshared uint  gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z cannot be trivially reconstructed
groupshared uint  gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique.
groupshared uint  gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
groupshared uint  gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
#endif // !PLATFORM_SUPPORTS_WAVE_INTRINSICS


// ----------- Use LDS for the vertex ring buffer as otherwise on FXC we create register spilling
// Every thread owns MAX_CLIP_VERTS consecutive entries, starting at (threadIdx * MAX_CLIP_VERTS).
// 4 arrays * 640 elements * 4 bytes each = 10240 bytes.

groupshared float gs_VertexRingBufferX[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferY[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferZ[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferW[MAX_CLIP_VERTS * THREADS_PER_GROUP];

// Reads the vertex stored in the slot 'entry' of the ring buffer owned by the thread 'threadIdx'.
// 'entry' must already be wrapped to [0, MAX_CLIP_VERTS); no wrapping is performed here.
float4 GetFromRingBuffer(uint threadIdx, uint entry)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    return float4(gs_VertexRingBufferX[slot],
                  gs_VertexRingBufferY[slot],
                  gs_VertexRingBufferZ[slot],
                  gs_VertexRingBufferW[slot]);
}

// Stores 'value' in the slot 'entry' of the ring buffer owned by the thread 'threadIdx'.
// 'entry' must already be wrapped to [0, MAX_CLIP_VERTS); no wrapping is performed here.
void WriteToRingBuffer(uint threadIdx, uint entry, float4 value)
{
    const uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    gs_VertexRingBufferX[slot] = value.x;
    gs_VertexRingBufferY[slot] = value.y;
    gs_VertexRingBufferZ[slot] = value.z;
    gs_VertexRingBufferW[slot] = value.w;
}
/////////////////////////////////////////////////////////


// Returns 'true' if it manages to cull the face 'f' of the light whose vertices start at
// 'baseOffsetVertex' in the groupshared vertex arrays.
// A face is culled when all 4 of its vertices are behind the same plane.
bool TryCullFace(uint f, uint baseOffsetVertex)
{
    uint packedVerts  = GetVertexListOfFace(f); // Four 3-bit vertex indices
    uint commonBehind = FACE_MASK;              // Start with every plane bit set

    for (uint corner = 0; corner < 4; corner++)
    {
        const uint v = packedVerts & 7;
        packedVerts >>= 3;

        // A plane bit survives only if this vertex is behind that plane, too.
        commonBehind &= gs_BehindMasksOfVerts[baseOffsetVertex + v];
    }

    return (commonBehind != 0);
}

// A polygon vertex prepared for clipping against a single plane 'p'.
// The vertex is considered to be inside with respect to that plane if (bc >= 0).
struct ClipVertex
{
    float4 pt; // Homogeneous coordinate after perspective
    float  bc; // Boundary coordinate with respect to the plane 'p'
};

// Pairs the homogeneous vertex 'v' with its boundary coordinate for the plane 'p'.
// Plane index layout: (p >> 1) selects the coordinate (0 = X, 1 = Y, 2 = Z);
// even planes are the ones crossing '0', odd planes are the ones crossing 'w'.
ClipVertex CreateClipVertex(uint p, float4 v)
{
    const float coord      = v[p >> 1];
    const bool  isOddPlane = (p & 1) != 0;

    ClipVertex result;

    result.pt = v;
    // dot(PlaneEquation, HapVertex): 'coord' for (coord >= 0), 'w - coord' for (coord <= w).
    result.bc = isOddPlane ? (v.w - coord) : coord;

    return result;
}

// Returns the homogeneous point at which the edge (v0, v1) crosses the plane the
// boundary coordinates were computed for. The caller must ensure that the two
// vertices lie on different sides of the plane.
float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
{
    const float bcDelta = v0.bc - v1.bc;
    // Saturate to guarantee that the parameter lies between 0 and 1.
    const float t = saturate(v0.bc * rcp(bcDelta));

    return lerp(v0.pt, v1.pt, t);
}

// Clips a convex polygon stored in the per-thread vertex ring buffer against the plane 'p'.
// p         - index of the clipping plane (see CreateClipVertex for the layout)
// srcBegin  - unwrapped index of the first input vertex
// srcSize   - number of input vertices
// threadIdx - selects the ring buffer of the calling thread
// dstBegin  - (out) unwrapped index of the first output vertex; always (srcBegin + srcSize)
// dstSize   - (out) number of output vertices
// The output is appended right after the input, wrapping around the ring buffer.
// Both indices handed to the ring buffer accessors are wrapped to [0, MAX_CLIP_VERTS).
void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
                             uint threadIdx,
                             out uint dstBegin, out uint dstSize)
{
    dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
    dstSize  = 0;

    // Start with the last vertex so that the first edge processed is (last, first).
    ClipVertex tailVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, (srcBegin + srcSize - 1) % MAX_CLIP_VERTS));

#ifdef OBTUSE_COMPILER
    // Wrapped read and write cursors, advanced by hand to avoid a modulo per iteration.
    uint modSrcIdx = srcBegin % MAX_CLIP_VERTS;
    uint modDstIdx = dstBegin % MAX_CLIP_VERTS;
#endif

    for (uint j = srcBegin; j < (srcBegin + srcSize); j++)
    {
    #ifndef OBTUSE_COMPILER
        uint modSrcIdx = j % MAX_CLIP_VERTS;
    #endif
        ClipVertex leadVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, modSrcIdx));

        // Execute Blinn's line clipping algorithm.
        // Classify the line segment. 4 cases:
        // 0. v0 out, v1 out -> add nothing
        // 1. v0 in,  v1 out -> add intersection
        // 2. v0 out, v1 in  -> add intersection, add v1
        // 3. v0 in,  v1 in  -> add v1
        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of -0.

        if ((tailVert.bc >= 0) != (leadVert.bc >= 0))
        {
            // The line segment is guaranteed to cross the plane.
            float4 clipVert = IntersectEdgeAgainstPlane(tailVert, leadVert);
        #ifndef OBTUSE_COMPILER
            uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
        #endif
            WriteToRingBuffer(threadIdx, modDstIdx, clipVert);
        #ifdef OBTUSE_COMPILER
            // Advance the write cursor and wrap it around.
            dstSize++;
            modDstIdx++;
            modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
        #endif
        }

        if (leadVert.bc >= 0)
        {
        #ifndef OBTUSE_COMPILER
            uint modDstIdx = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
        #endif
            WriteToRingBuffer(threadIdx, modDstIdx, leadVert.pt);

            //vertRingBuffer[modDstIdx] = leadVert.pt;
        #ifdef OBTUSE_COMPILER
            // Advance the write cursor and wrap it around.
            dstSize++;
            modDstIdx++;
            modDstIdx = (modDstIdx == MAX_CLIP_VERTS) ? 0 : modDstIdx;
        #endif
        }

    #ifdef OBTUSE_COMPILER
        // Advance the read cursor and wrap it around.
        modSrcIdx++;
        modSrcIdx = (modSrcIdx == MAX_CLIP_VERTS) ? 0 : modSrcIdx;
    #endif
        tailVert = leadVert; // Avoid recomputation and overwriting the vertex in the ring buffer
    }
}

// Clips the face 'f' of a light volume against the planes of the view volume.
// f                - index of the face (see GetVertexListOfFace)
// baseVertexOffset - offset of the light's first vertex in the groupshared vertex arrays
// srcBegin         - (out) unwrapped ring buffer index of the first vertex of the result
// srcSize          - (out) number of vertices of the result
// threadIdx        - selects the ring buffer of the calling thread
void ClipFaceAgainstViewVolume(uint f, uint baseVertexOffset,
                               out uint srcBegin, out uint srcSize,
                               uint threadIdx)
{
    // The unclipped quad occupies the first 4 entries of the ring buffer.
    srcBegin = 0;
    srcSize  = 4;

    uint planesToClip = 0;                      // Initially in front of every plane
    uint packedVerts  = GetVertexListOfFace(f); // Four 3-bit vertex indices

    for (uint corner = 0; corner < 4; corner++)
    {
        const uint slot = baseVertexOffset + (packedVerts & 7);
        packedVerts >>= 3;

        // A plane bit gets set if ANY of the vertices is behind that plane.
        planesToClip |= gs_BehindMasksOfVerts[slot];

        // Not all edges may require clipping. However, filtering the vertex list
        // is somewhat expensive, so we currently don't do it.
        const float4 hapVert = float4(gs_HapVertsX[slot],
                                      gs_HapVertsY[slot],
                                      gs_HapVertsZ[slot],
                                      gs_HapVertsW[slot]);

        WriteToRingBuffer(threadIdx, corner, hapVert);
    }

    // Sutherland-Hodgman polygon clipping algorithm.
    // It works by clipping the entire polygon against one clipping plane at a time.
    while (planesToClip != 0)
    {
        const uint p = firstbitlow(planesToClip);

        uint clippedBegin, clippedSize;
        ClipPolygonAgainstPlane(p, srcBegin, srcSize, threadIdx, clippedBegin, clippedSize);

        // The output of this pass is the input of the next one.
        srcBegin = clippedBegin;
        srcSize  = clippedSize;

        planesToClip &= ~(1u << p); // Clear the bit to continue using firstbitlow()
    }
}

// Grows the AABB so that it contains all vertices of a (clipped) polygon stored in the
// per-thread vertex ring buffer.
// srcBegin, srcSize - unwrapped index of the first vertex and the number of vertices
// threadIdx         - selects the ring buffer of the calling thread
// isOrthoProj       - if set, the 4th component is computed using row 2 of 'invProjMat'
//                     rather than taken from the W component of the vertex
// ndcAaBbMinPt/Max  - (inout) running bounds; XYZ are saturated post-division coordinates
void UpdateAaBb(uint srcBegin, uint srcSize, uint threadIdx,
                bool isOrthoProj, float4x4 invProjMat,
                inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
{
    const uint srcEnd = srcBegin + srcSize;

#ifdef OBTUSE_COMPILER
    uint ringIdx = srcBegin % MAX_CLIP_VERTS; // Wrapped read cursor, advanced by hand
#endif
    for (uint k = srcBegin; k < srcEnd; k++)
    {
    #ifndef OBTUSE_COMPILER
        uint ringIdx = k % MAX_CLIP_VERTS;
    #endif
        const float4 hapVert = GetFromRingBuffer(threadIdx, ringIdx);

        // Clamp to the bounds in case of numerical errors (may still generate -0).
        const float3 ndcXYZ = saturate(hapVert.xyz * rcp(hapVert.w));
        // For the orthographic projection, W must be replaced (w = 1).
        const float  viewZ  = isOrthoProj ? dot(invProjMat[2], hapVert) : hapVert.w;

        const float4 candidate = float4(ndcXYZ, viewZ);

        ndcAaBbMinPt = min(ndcAaBbMinPt, candidate);
        ndcAaBbMaxPt = max(ndcAaBbMaxPt, candidate);
    #ifdef OBTUSE_COMPILER
        ringIdx = (ringIdx + 1 == MAX_CLIP_VERTS) ? 0 : (ringIdx + 1);
    #endif
    }
}

// Computes the bounds (min, max) of a sphere along one axis of the projection plane.
// Given: 'C' is the center of the sphere in the view space, 'r' is its radius;
// 'projScale' and 'projOffset' are used to perform projection of the X (or Y) component of a vector.
// Returns float2(xMin, xMax). The result is (-1, 1) if no tangent planes exist, and a
// bound is infinite if the corresponding tangent direction points behind the origin.
//
// See sec. 8.2.1 of https://foundationsofgameenginedev.com/#fged2 for an alternative derivation.
// Goal: find the planes that pass through the origin O, bound the sphere, and form
// an axis-aligned rectangle at the intersection with the projection plane.
// Solution (for the X-coordinate):
// The intersection of the bounding planes and the projection plane must be vertical lines,
// which means that the bounding planes must be tangent to the Y-axis.
// The bounding planes must also be tangent to the sphere.
// Call the points where the two vertical bounding planes touch the sphere B and D.
// Assume that B is on the left of C; D is on the right of C.
// Note that C may be behind the origin, so the same generally goes for B and D.
// BC is normal w.r.t. the bounding plane, so it is normal w.r.t. the Y-axis; |BC| = r.
// As a consequence, it lies in a plane parallel to the O-X-Z plane.
// Consider B'C', which is an orthogonal projection of BC onto the actual O-X-Z plane.
// (Imagine sliding the sphere up or down between the bounding planes).
// We then consider a triangle OB'C' that lies entirely in the O-X-Z plane.
// The coordinates are: OB' = (b.x, 0, b.z), OC' = (c.x, 0, c.z).
float2 ComputeBoundsOfSphereOnProjectivePlane(float3 C, float r, float projScale, float projOffset)
{
    // OBC is a right triangle. So is OB'C'.
    // |BC| = |B'C'| = r.
    // |OB'|^2 = |OC'|^2 - |B'C'|^2.
    const float lenSqOC = dot(C.xz, C.xz);
    const float lenSqOB = lenSqOC - r * r;

    // If |OB'| = 0 or |OC'| = 0, the bounding planes tangent to the sphere do not exist.
    if (!(lenSqOB > 0))
    {
        // Conservative estimate (we do not cull the bounding sphere using the view frustum).
        return float2(-1, 1);
    }

    const float lenOB = sqrt(lenSqOB);

    // |OB' x OC'| = |OB'| * |OC'| * Sin[a'].
    //  OB' . OC'  = |OB'| * |OC'| * Cos[a'].
    // We can determine Sin[a'] = |B'C'| / |OC'| = r / |OC'|.
    // Cos[a'] = Sqrt[1 - Sin[a']^2].
    // (OB' x OC') points along Y.
    // (OB' x OC').y = b.z * c.x - b.x * c.z.
    // Therefore,  b.z * c.x - b.x * c.z = |OB'| * |OC'| * Sin[a'].
    // OB' . OC' = b.x * c.x + b.z * c.z = |OB'| * |OC'| * Cos[a'].
    // Since we don't care about the scale, and |OB'| != 0 and |OC'| != 0,
    // we can equivalently solve
    // z * c.x - x * c.z = |OC'|^3 * Sin[a'].
    // x * c.x + z * c.z = |OC'|^3 * Cos[a'].
    // With 2 equations and 2 unknowns, we can easily solve this linear system.
    // The solution is
    // x = -c.z * r + c.x * |OB'|.
    // z =  c.x * r + c.z * |OB'|.
    const float Bx = C.x * lenOB - (C.z * r);
    const float Bz = C.z * lenOB + (C.x * r);

    // (OD' x OC') points along Y.
    // (OD' x OC').y = d.z * c.x - d.x * c.z.
    // We must solve
    // z * c.x - x * c.z = -|OC'|^3 * Sin[a'].
    // x * c.x + z * c.z =  |OC'|^3 * Cos[a'].
    // The solution is
    // x =  c.z * r + c.x * |OB'|.
    // z = -c.x * r + c.z * |OB'|.
    const float Dx = C.x * lenOB + (C.z * r);
    const float Dz = C.z * lenOB - (C.x * r);

    // We can transform OB and OD as direction vectors.
    // For the simplification below, see OptimizeProjectionMatrix.
    const float rapBx = (Bx * rcp(Bz)) * projScale + projOffset;
    const float rapDx = (Dx * rcp(Dz)) * projScale + projOffset;

    // One problem with the above is that a direction may, for certain spheres,
    // point behind the origin (B.z <= 0 or D.z <= 0).
    // At this point we know that the sphere is at least *partially* in front of the origin,
    // and that we are not inside the sphere, so there is at least one valid
    // plane (and one valid direction). We just need the second direction to go "in front"
    // of the first one to extend the bounding box.
    const float xMin = (Bz > 0) ? rapBx : -FLT_INF;
    const float xMax = (Dz > 0) ? rapDx :  FLT_INF;

    return float2(xMin, xMax);
}

//**********************************************************************************************
// The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
// The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
//
// Since a light volume may be partially off-screen, we must clip it before computing the AABB.
// Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
//
// To avoid having to deal with the "Moebius twist" property of the perspective transform,
// we perform clipping using the homogeneous (projective) post-perspective coordinates.
// This clipping method is described in Blinn's paper titled "Line Clipping".
//
// The algorithm processes a light on 4 threads. While all 6 faces may require clipping in the
// worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
// Some faces may require culling rather than clipping (the former is simpler).
//
// It's important to realize that face culling may end up culling 5 (or even all 6) faces.
// This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
// (Imagine a view volume completely or partially inside a light volume).
// Therefore, we must perform view-volume-corner-inside-light-volume tests.
//
//
// Notation:
// rbp - real (3D) coordinates before perspective
// hbp - hom. (4D) coordinates before perspective
// hap - hom. (4D) coordinates after  perspective
// rap - real (3D) coordinates after  perspective (after division by w)
// *********************************************************************************************

[numthreads(THREADS_PER_GROUP, 1, 1)]
void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
{
    const uint t        = threadID;
    const uint g        = groupID.x;
    const uint eyeIndex = groupID.y; // Currently, can only be 0 or 1

    const uint intraGroupLightIndex = t / THREADS_PER_LIGHT;
    const uint globalLightIndex     = g * LIGHTS_PER_GROUP + intraGroupLightIndex;
    const uint baseVertexOffset     = intraGroupLightIndex * NUM_VERTS;

    const uint eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
    const SFiniteLightBound  cullData = g_data[eyeAdjustedInputOffset];

    const float4x4 projMat    = g_mProjectionArr[eyeIndex];
    const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex];

    // Bounding frustum.
    const float3 rbpC  = cullData.center.xyz;   // View-space
    const float3 rbpX  = cullData.boxAxisX.xyz; // Pre-scaled
    const float3 rbpY  = cullData.boxAxisY.xyz; // Pre-scaled
    const float3 rbpZ  = cullData.boxAxisZ.xyz; // Pre-scaled
    const float scale  = cullData.scaleXY;      // scale.x = scale.y
    // Bounding sphere.
    const float radius = cullData.radius;

#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    // (0) Initialize the TGSM.
    if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
    {
        gs_CullClipFaceMasks[intraGroupLightIndex] = 0; // Initially inside
        gs_NdcAaBbMinPtX[intraGroupLightIndex]     = asuint(1.0f);
        gs_NdcAaBbMaxPtX[intraGroupLightIndex]     = asuint(0.0f);
        gs_NdcAaBbMinPtY[intraGroupLightIndex]     = asuint(1.0f);
        gs_NdcAaBbMaxPtY[intraGroupLightIndex]     = asuint(0.0f);
        gs_NdcAaBbMinPtZ[intraGroupLightIndex]     = asuint(1.0f);
        gs_NdcAaBbMaxPtZ[intraGroupLightIndex]     = asuint(0.0f);
        gs_NdcAaBbMinPtW[intraGroupLightIndex]     = asuint(FLT_INF);
        gs_NdcAaBbMaxPtW[intraGroupLightIndex]     = asuint(0.0f);
    }
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS

    float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF);
    float4 ndcAaBbMaxPt = 0;

    // We must determine whether we have to clip or cull any of the faces.
    // If all vertices of a face are inside with respect to all the culling planes,
    // we can trivially accept that face. If all vertices of a face are behind
    // any single plane, we can trivially reject (cull) that face.
    uint cullClipFaceMask = 0; // Initially inside

    uint i; // Avoid multiply-declared variable warning

    // (1) Compute the vertices of the light volume.
    for (i = 0; i < VERTS_PER_THREAD; i++)
    {
        uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

        // rbpVerts[0] = rbpC - rbpX * scale - rbpY * scale - rbpZ; (-s, -s, -1)
        // rbpVerts[1] = rbpC + rbpX * scale - rbpY * scale - rbpZ; (+s, -s, -1)
        // rbpVerts[2] = rbpC - rbpX * scale + rbpY * scale - rbpZ; (-s, +s, -1)
        // rbpVerts[3] = rbpC + rbpX * scale + rbpY * scale - rbpZ; (+s, +s, -1)
        // rbpVerts[4] = rbpC - rbpX         - rbpY         + rbpZ; (-1, -1, +1)
        // rbpVerts[5] = rbpC + rbpX         - rbpY         + rbpZ; (+1, -1, +1)
        // rbpVerts[6] = rbpC - rbpX         + rbpY         + rbpZ; (-1, +1, +1)
        // rbpVerts[7] = rbpC + rbpX         + rbpY         + rbpZ; (+1, +1, +1)

        float3 m = GenerateVertexOfStandardCube(v);
        m.xy *= ((v & 4) == 0) ? scale : 1; // X, Y in [-scale, scale]

        float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
        // Avoid generating (w = 0).
        rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN;

        float4 hapVert = mul(projMat, float4(rbpVertVS, 1));

        // Warning: the W component may be negative.
        // Flipping the -W pyramid by negating all coordinates is incorrect
        // and will break both classification and clipping.
        // For the orthographic projection, (w = 1).

        // Transform the X and Y components: [-w, w] -> [0, w].
        hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);

        // For each vertex, we must determine whether it is within the bounds.
        // For culling and clipping, we must know, per culling plane, whether the vertex
        // is in the positive or the negative half-space.
        uint behindMask = 0; // Initially in front

        // Consider the vertex to be inside the view volume if:
        // 0 <= x <= w
        // 0 <= y <= w   <-- include boundary points to avoid clipping them later
        // 0 <= z <= w
        // w is always valid
        // TODO: epsilon for numerical robustness?

        for (uint j = 0; j < (NUM_PLANES / 2); j++)
        {
            float w = hapVert.w;

            behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
            behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
        }

        if (behindMask == 0) // Inside?
        {
            // Clamp to the bounds in case of numerical errors (may still generate -0).
            float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));

            ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVS.z));
            ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVS.z));
        }
        else // Outside
        {
            // Mark all the faces of the bounding frustum associated with this vertex.
            cullClipFaceMask |= GetFaceMaskOfVertex(v);
        }

        gs_HapVertsX[baseVertexOffset + v]          = hapVert.x;
        gs_HapVertsY[baseVertexOffset + v]          = hapVert.y;
        gs_HapVertsZ[baseVertexOffset + v]          = hapVert.z;
        gs_HapVertsW[baseVertexOffset + v]          = hapVert.w;
        gs_BehindMasksOfVerts[baseVertexOffset + v] = behindMask;
    }

#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
    {
        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
        uint orMask  = 0;                       // Plays no role
        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB

        cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
    }
#else
    InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);

    GroupMemoryBarrierWithGroupSync();

    cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
#endif

    // (2) Test the corners of the view volume.
    if (cullClipFaceMask != 0)
    {
        // The light is partially outside the view volume.
        // Therefore, some of the corners of the view volume may be inside the light volume.
        // We perform aggressive culling, so we must make sure they are accounted for.
        // The light volume is a special type of cuboid - a right frustum.
        // We can exploit this fact by building a light-space projection matrix.
        // P_v = T * (R * S) * P_l
        // P_l = (R * S)^{-1} * T^{-1} * P_v
        float4x4 invTranslateToLightSpace      = Translation4x4(-rbpC);
        float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(ScaledRotation3x3(rbpX, rbpY, rbpZ)));
        // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.

        // This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
        float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace);

        if (scale != 1) // Perspective light space?
        {
            // Compute the parameters of the perspective projection.
            float s = scale;
            float e = -1 - 2 * (s * rcp(1 - s)); // Signed distance from the origin to the eye
            float n = -e - 1;                    // Distance from the eye to the near plane
            float f = -e + 1;                    // Distance from the eye to the far plane
            float g = f;                         // Distance from the eye to the projection plane

            float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
            float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f);

            lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix);
        }

        for (i = 0; i < VERTS_PER_THREAD; i++)
        {
            uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

            float3 rapVertCS = GenerateVertexOfStandardCube(v);
            rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1]

            float4 hbpVertVS = mul(invProjMat, float4(rapVertCS, 1)); // Clip to view space
            float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS);      // View to light space

            // Consider the vertex to be inside the light volume if:
            // -w < x < w
            // -w < y < w   <-- exclude boundary points, as we will not clip using these vertices
            // -w < z < w   <-- assume that Z-precision is not very important here
            // 0  < w
            // TODO: epsilon for numerical robustness?

            bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w;

            if (inside)
            {
                float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
                float  rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);

                ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
                ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));
            }
        }
    }

#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    GroupMemoryBarrierWithGroupSync();
#endif

    // (3) Cull the faces.
    {
        const uint cullFaceMask   = cullClipFaceMask;
        const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]

        for (i = 0; i < FACES_PER_THREAD; i++)
        {
            uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

            if (n < numFacesToCull)
            {
                uint f = NthBitLow(cullFaceMask, n);

                if (TryCullFace(f, baseVertexOffset))
                {
                    cullClipFaceMask ^= 1 << f; // Clear the bit
                }
            }
        }
    }

#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
    {
        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
        uint orMask  = 0;                       // Plays no role
        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB

        cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
    }
#else
    InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);

    GroupMemoryBarrierWithGroupSync();

    cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
#endif

    // (4) Clip the faces.
    {
        const uint clipFaceMask   = cullClipFaceMask;
        const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]

        for (i = 0; i < FACES_PER_THREAD; i++)
        {
            uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

            if (n < numFacesToClip)
            {
                uint f = NthBitLow(clipFaceMask, n);

                uint   srcBegin, srcSize;
                ClipFaceAgainstViewVolume(f, baseVertexOffset,
                                          srcBegin, srcSize, t);
                UpdateAaBb(srcBegin, srcSize, t, g_isOrthographic != 0, invProjMat,
                           ndcAaBbMinPt, ndcAaBbMaxPt);
            }
        }
    }

#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
    {
        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
        uint orMask  = 0;                       // Plays no role
        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB

        ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask));
        ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask));
        ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask));
        ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, andMask, orMask, xorMask));
        ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, andMask, orMask, xorMask));
        ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, andMask, orMask, xorMask));
        ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, andMask, orMask, xorMask));
        ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, andMask, orMask, xorMask));
    }
#else
    // Integer comparison works for floating-point numbers as long as the sign bit is 0.
    // We must take care of -0 ourselves. saturate() does not help.
    InterlockedMin(gs_NdcAaBbMinPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
    InterlockedMax(gs_NdcAaBbMaxPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
    InterlockedMin(gs_NdcAaBbMinPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
    InterlockedMax(gs_NdcAaBbMaxPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y)));
    InterlockedMin(gs_NdcAaBbMinPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z)));
    InterlockedMax(gs_NdcAaBbMaxPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z)));
    InterlockedMin(gs_NdcAaBbMinPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w)));
    InterlockedMax(gs_NdcAaBbMaxPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w)));

    GroupMemoryBarrierWithGroupSync();

    ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[intraGroupLightIndex]);
    ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[intraGroupLightIndex]);
    ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[intraGroupLightIndex]);
    ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[intraGroupLightIndex]);
    ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[intraGroupLightIndex]);
    ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[intraGroupLightIndex]);
    ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[intraGroupLightIndex]);
    ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]);
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS

    // (5) Compute the AABB of the bounding sphere.
    if (radius > 0)
    {
        // Occasionally, an intersection of AABBs of a bounding sphere and a bounding frustum
        // results in a tighter AABB when compared to using the AABB of the frustum alone.
        // That is the case (mostly) for sphere-capped spot lights with very wide angles.
        // Note that, unfortunately, it is not quite as tight as an AABB of a CSG intersection
        // of a sphere and frustum. Also note that the algorithm below doesn't clip the bounding
        // sphere against the view frustum before computing the bounding box, simply because it is
        // too hard/expensive. I will leave it as a TODO in case someone wants to tackle this problem.
        if ((rbpC.z + radius) > 0) // Is the sphere at least *partially* in front of the origin?
        {
            ndcAaBbMinPt.w = max(ndcAaBbMinPt.w, rbpC.z - radius);
            ndcAaBbMaxPt.w = min(ndcAaBbMaxPt.w, rbpC.z + radius);
            // Computing the 'z' component for an arbitrary projection matrix is hard, so we don't do it.
            // See sec. 8.2.2 of https://foundationsofgameenginedev.com/#fged2 for a solution.

            float2 rectMin, rectMax;

            // For the 'x' and 'y' components, the solution is given below.
            if (g_isOrthographic)
            {
                // Compute the center and the extents (half-diagonal) of the bounding box.
                float2 center  = mul(projMat, float4(rbpC.xyz,     1)).xy;
                float2 extents = mul(projMat, float4(radius.xx, 0, 0)).xy;

                rectMin = center - extents;
                rectMax = center + extents;
            }
            else // Perspective
            {
                float2 xBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.xxz, radius, projMat._m00, projMat._m02); // X-Z plane
                float2 yBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.yyz, radius, projMat._m11, projMat._m12); // Y-Z plane

                rectMin = float2(xBounds.r, yBounds.r);
                rectMax = float2(xBounds.g, yBounds.g);
            }

            // Transform to the NDC coordinates.
            rectMin = rectMin * 0.5 + 0.5;
            rectMax = rectMax * 0.5 + 0.5;

            // Note: separating the X- and Y-computations across 2 threads is not worth it.
            ndcAaBbMinPt.xy = max(ndcAaBbMinPt.xy, rectMin);
            ndcAaBbMaxPt.xy = min(ndcAaBbMaxPt.xy, rectMax);
        }
    }

    if ((globalLightIndex < (uint)g_iNrVisibLights) && (t % THREADS_PER_LIGHT == 0)) // Avoid bank conflicts
    {
        // For stereo, we have two sets of lights. Therefore, each eye has a set of mins
        // followed by a set of maxs, and each set contains g_iNrVisibLights entries.
        const ScreenSpaceBoundsIndices eyeAdjustedOutputOffsets = GenerateScreenSpaceBoundsIndices(globalLightIndex, g_iNrVisibLights, eyeIndex);

        g_vBoundsBuffer[eyeAdjustedOutputOffsets.min] = ndcAaBbMinPt;
        g_vBoundsBuffer[eyeAdjustedOutputOffsets.max] = ndcAaBbMaxPt;
    }
}