// #pragma enable_d3d11_debug_symbols

#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

#pragma kernel main

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition-config/Runtime/ShaderConfig.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightCullUtils.hlsl"

/* ------------------------------ Inputs ------------------------------------ */

// Per-light bounding volumes. The kernel reads one SFiniteLightBound per light,
// so the element type of the structured buffer must be spelled out.
StructuredBuffer<SFiniteLightBound> g_data : register(t0);

/* ------------------------------ Outputs ----------------------------------- */

// Per-light AABB corners. The kernel stores float4 min/max points into it.
RWStructuredBuffer<float4> g_vBoundsBuffer : register(u0);

/* ------------------------------ Utilities --------------------------------- */

// Returns the location of the N-th set bit starting from the lowest order bit and working upward.
// 'n' is zero-based: n = 0 yields the same result as firstbitlow(value).
// If 'value' has 'n' or fewer bits set, returns (uint)-1, consistent with firstbitlow().
// Slow implementation - do not use for large bit sets.
// Could be optimized - see https://graphics.stanford.edu/~seander/bithacks.html
uint NthBitLow(uint value, uint n)
{
    uint b = -1;                // Consistent with the behavior of firstbitlow()
    uint c = countbits(value);

    if (n < c)                  // Validate inputs
    {
        uint r = n + 1;         // Number of set bits we still have to find

        do
        {
            // Find the next set bit above the current position.
            uint f = firstbitlow(value >> (b + 1));
            // Make a guess (assume all [b+f+1, b+f+r] bits are set).
            // The guess can never overshoot, since that range holds at most 'r' bits.
            b += f + r;
            // Count the number of bits actually set within [0, b].
            c = countbits(value << (32 - (b + 1)));
            // Compute the number of remaining bits.
            r = (n + 1) - c;
        } while (r > 0);
    }

    return b;
}

// Builds a 4x4 matrix that translates a point by 'd'.
float4x4 Translation4x4(float3 d)
{
    float4x4 M = k_identity4x4;

    M._14_24_34 = d; // Last column

    return M;
}

// Scale followed by rotation (scaled axes).
// Builds a 3x3 matrix whose columns are the given (pre-scaled) axes.
float3x3 ScaledRotation3x3(float3 xAxis, float3 yAxis, float3 zAxis)
{
    // float3x3() is filled row by row, so transpose to turn the rows into columns.
    return transpose(float3x3(xAxis, yAxis, zAxis));
}

// Inverts a 3x3 matrix using the adjugate built from cross products of its columns.
// No check is made for a singular matrix (det = 0).
float3x3 Invert3x3(float3x3 R)
{
    float3x3 cols = transpose(R); // Row to column

    float3 c0 = cols[0];
    float3 c1 = cols[1];
    float3 c2 = cols[2];

    float3 c1xc2 = cross(c1, c2);
    float  det   = dot(c0, c1xc2);

    float3x3 adj = float3x3(c1xc2,
                            cross(c2, c0),
                            cross(c0, c1));

    return rcp(det) * adj;
}

// Embeds a 3x3 matrix into the upper-left corner of a 4x4 matrix (no translation).
float4x4 Homogenize3x3(float3x3 R)
{
    return float4x4(float4(R[0], 0),
                    float4(R[1], 0),
                    float4(R[2], 0),
                    float4(0, 0, 0, 1));
}

// Builds a perspective projection matrix.
// a - aspect ratio (divides the X scale), g - distance to the projection plane,
// n - distance to the near plane, f - distance to the far plane.
float4x4 PerspectiveProjection4x4(float a, float g, float n, float f)
{
    float b = (f + n) * rcp(f - n);
    float c = -2 * f * n * rcp(f - n);

    return float4x4(g/a, 0, 0, 0,
                      0, g, 0, 0,  // No Y-flip
                      0, 0, b, c,  // Z in [-1, 1], no Z-reversal
                      0, 0, 1, 0); // No W-flip
}

/* ------------------------------ Implementation ---------------------------- */

// !!! IMPORTANT !!!
// The legacy code from Morten provides us special projection matrices (and their inverses).
// These matrices are different from the matrices the HDRP uses.
// There is no reversed-Z buffering (effectively, forced UNITY_REVERSED_Z = 0).
// Additionally, there is no clip-space flip (effectively, forced UNITY_UV_STARTS_AT_TOP = 0).
// Therefore, all coordinate systems are left-handed, Y-up, without W-flip.
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
// y z
// | /
// 0 -- x

// Improve the quality of generated code at the expense of readability.
// Remove when the shader compiler is clever enough to perform this optimization for us.
#define OBTUSE_COMPILER

#ifdef SHADER_API_XBOXONE
// The Xbox shader compiler expects the lane swizzle mask to be a compile-time constant.
// In our case, the mask is a compile-time constant, but it is defined inside a loop
// that is unrolled at the compile time, and the constants are generated during the
// constant propagation pass of the optimizer. This works fine on PlayStation, but does not work
// on Xbox. In order to avoid writing hideous code specifically for Xbox, we disable the support
// of wave intrinsics on Xbox until the Xbox compiler is fixed.
#undef PLATFORM_SUPPORTS_WAVE_INTRINSICS
#endif

#define CLEAR_SIGN_BIT(X)   (asint(X) & INT_MAX)
#define DIV_ROUND_UP(N, D)  (((N) + (D) - 1) / (D)) // No division by 0 checks

// Clipping a plane by a cube may produce a hexagon (6-gon).
// Clipping a hexagon by 4 planes may produce a decagon (10-gon).
#define MAX_CLIP_VERTS (10)
#define NUM_VERTS      (8)
#define NUM_FACES      (6)
#define NUM_PLANES     (6)

#define THREADS_PER_GROUP (64)
#define THREADS_PER_LIGHT (4) // Set to 1 for debugging
#define LIGHTS_PER_GROUP  (THREADS_PER_GROUP / THREADS_PER_LIGHT)
#define VERTS_PER_GROUP   (NUM_VERTS * LIGHTS_PER_GROUP)
#define VERTS_PER_THREAD  (NUM_VERTS / THREADS_PER_LIGHT)
#define FACES_PER_THREAD  DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)

// All planes and faces are always in the standard order (see below).
// Near and far planes are swapped in the case of Z-reversal, but it does not change the algorithm.
#define FACE_LEFT   (1 << 0) // -X
#define FACE_RIGHT  (1 << 1) // +X
#define FACE_BOTTOM (1 << 2) // -Y
#define FACE_TOP    (1 << 3) // +Y
#define FACE_FRONT  (1 << 4) // -Z
#define FACE_BACK   (1 << 5) // +Z
#define FACE_MASK   ((1 << NUM_FACES) - 1)

// A list of vertices for each face (CCW order w.r.t. its normal, starting from the LSB).
// Each list packs four 3-bit vertex indices.
#define VERT_LIST_LEFT   ((4) << 9 | (6) << 6 | (2) << 3 | (0) << 0)
#define VERT_LIST_RIGHT  ((3) << 9 | (7) << 6 | (5) << 3 | (1) << 0)
#define VERT_LIST_BOTTOM ((1) << 9 | (5) << 6 | (4) << 3 | (0) << 0)
#define VERT_LIST_TOP    ((6) << 9 | (7) << 6 | (3) << 3 | (2) << 0)
#define VERT_LIST_FRONT  ((2) << 9 | (3) << 6 | (1) << 3 | (0) << 0)
#define VERT_LIST_BACK   ((5) << 9 | (7) << 6 | (6) << 3 | (4) << 0)

// All vertices are always in the standard order (see below).
// Returns the mask of the 3 faces (FACE_* bits) that share the cube vertex 'v' (in [0, 7]).
uint GetFaceMaskOfVertex(uint v)
{
    // 0: (-1, -1, -1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_FRONT }
    // 1: (+1, -1, -1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_FRONT }
    // 2: (-1, +1, -1) -> { FACE_LEFT  | FACE_TOP    | FACE_FRONT }
    // 3: (+1, +1, -1) -> { FACE_RIGHT | FACE_TOP    | FACE_FRONT }
    // 4: (-1, -1, +1) -> { FACE_LEFT  | FACE_BOTTOM | FACE_BACK  }
    // 5: (+1, -1, +1) -> { FACE_RIGHT | FACE_BOTTOM | FACE_BACK  }
    // 6: (-1, +1, +1) -> { FACE_LEFT  | FACE_TOP    | FACE_BACK  }
    // 7: (+1, +1, +1) -> { FACE_RIGHT | FACE_TOP    | FACE_BACK  }
    // Equivalent to:
    // (((v & 1) == 0) ? 1 : 2) | (((v & 2) == 0) ? 4 : 8) | (((v & 4) == 0) ? 16 : 32)
    uint xBit = BitFieldExtract(v, 0, 1); // 0 -> left,   1 -> right
    uint yBit = BitFieldExtract(v, 1, 1); // 0 -> bottom, 1 -> top
    uint zBit = BitFieldExtract(v, 2, 1); // 0 -> front,  1 -> back

    return (FACE_LEFT << xBit) | (FACE_BOTTOM << yBit) | (FACE_FRONT << zBit);
}

// Returns the position of the vertex 'v' (in [0, 7]) of the [-1, 1]^3 cube.
float3 GenerateVertexOfStandardCube(uint v)
{
    float3 p;

    p.x = ((v & 1) != 0) ? 1 : -1; // FACE_RIGHT : FACE_LEFT
    p.y = ((v & 2) != 0) ? 1 : -1; // FACE_TOP   : FACE_BOTTOM
    p.z = ((v & 4) != 0) ? 1 : -1; // FACE_BACK  : FACE_FRONT

    return p;
}

// Returns the packed list of 4 vertex indices (3 bits each) of the face with the index 'f' (in [0, 5]).
uint GetVertexListOfFace(uint f)
{
    // Two 12-bit lists are packed per component: the odd face goes into the upper half.
    // Warning: don't add 'static' here unless you want really bad code gen.
    const uint3 allVertLists = uint3((VERT_LIST_RIGHT << 12) | VERT_LIST_LEFT,
                                     (VERT_LIST_TOP   << 12) | VERT_LIST_BOTTOM,
                                     (VERT_LIST_BACK  << 12) | VERT_LIST_FRONT);

    uint pairIndex = f >> 1;       // Which axis (X, Y, Z)
    uint bitOffset = 12 * (f & 1); // Lower or upper list of the pair

    return BitFieldExtract(allVertLists[pairIndex], bitOffset, 12);
}

// 5 arrays * 128 elements * 4 bytes each = 2560 bytes.
groupshared float gs_HapVertsX[VERTS_PER_GROUP];
groupshared float gs_HapVertsY[VERTS_PER_GROUP];
groupshared float gs_HapVertsZ[VERTS_PER_GROUP];
groupshared float gs_HapVertsW[VERTS_PER_GROUP];
groupshared uint  gs_BehindMasksOfVerts[VERTS_PER_GROUP]; // 6 planes each (HLSL does not support small data types)

#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
// 1 array * 16 elements * 4 bytes each = 64 bytes.
groupshared uint gs_CullClipFaceMasks[LIGHTS_PER_GROUP]; // 6 faces each (HLSL does not support small data types)

// 8 arrays * 16 elements * 4 bytes each = 512 bytes.
// These are actually floats reinterpreted as uints.
// The reason is because floating-point atomic operations are not supported.
groupshared uint gs_NdcAaBbMinPtX[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMaxPtX[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMinPtY[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMaxPtY[LIGHTS_PER_GROUP];
groupshared uint gs_NdcAaBbMinPtZ[LIGHTS_PER_GROUP]; // Note that min-max Z cannot be trivially reconstructed
groupshared uint gs_NdcAaBbMaxPtZ[LIGHTS_PER_GROUP]; // from min-max W if the projection is oblique.
groupshared uint gs_NdcAaBbMinPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
groupshared uint gs_NdcAaBbMaxPtW[LIGHTS_PER_GROUP]; // View-space Z coordinate
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS

// ----------- Use LDS for the vertex ring buffer as otherwise on FXC we create register spilling
// Each thread owns a private slice of MAX_CLIP_VERTS consecutive entries.
groupshared float gs_VertexRingBufferX[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferY[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferZ[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferW[MAX_CLIP_VERTS * THREADS_PER_GROUP];

// Reads the vertex stored in the slot 'entry' of the ring buffer slice of the thread 'threadIdx'.
// 'entry' is not wrapped here; the caller passes a value already reduced modulo MAX_CLIP_VERTS.
float4 GetFromRingBuffer(uint threadIdx, uint entry)
{
    uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    return float4(gs_VertexRingBufferX[slot],
                  gs_VertexRingBufferY[slot],
                  gs_VertexRingBufferZ[slot],
                  gs_VertexRingBufferW[slot]);
}

// Stores 'value' into the slot 'entry' of the ring buffer slice of the thread 'threadIdx'.
// 'entry' is not wrapped here; the caller passes a value already reduced modulo MAX_CLIP_VERTS.
void WriteToRingBuffer(uint threadIdx, uint entry, float4 value)
{
    uint slot = threadIdx * MAX_CLIP_VERTS + entry;

    gs_VertexRingBufferX[slot] = value.x;
    gs_VertexRingBufferY[slot] = value.y;
    gs_VertexRingBufferZ[slot] = value.z;
    gs_VertexRingBufferW[slot] = value.w;
}

/////////////////////////////////////////////////////////

// Returns 'true' if it manages to cull the face.
// 'f' is the face index (in [0, 5]); 'baseOffsetVertex' is the offset of the first
// vertex of the light within the gs_BehindMasksOfVerts array.
bool TryCullFace(uint f, uint baseOffsetVertex)
{
    uint commonBehindMask = FACE_MASK; // Initially behind
    uint packedVerts      = GetVertexListOfFace(f);

    for (uint k = 0; k < 4; k++)
    {
        uint v = BitFieldExtract(packedVerts, 3 * k, 3);
        // Non-zero if ALL the vertices are behind any of the planes.
        commonBehindMask &= gs_BehindMasksOfVerts[baseOffsetVertex + v];
    }

    return (commonBehindMask != 0);
}

struct ClipVertex
{
    float4 pt; // Homogeneous coordinate after perspective
    float  bc; // Boundary coordinate with respect to the plane 'p'
};

// Pairs the vertex 'v' with its boundary coordinate w.r.t. the plane 'p' (in [0, 5]).
// Even planes pass through 0 (bc = c), odd planes pass through w (bc = w - c),
// where 'c' is the X, Y or Z component selected by (p / 2).
ClipVertex CreateClipVertex(uint p, float4 v)
{
    bool  crossesZero = (p & 1) == 0;
    float coord       = v[p >> 1];
    float w           = v.w;

    ClipVertex result;

    result.pt = v;
    result.bc = crossesZero ? coord : w - coord; // dot(PlaneEquation, HapVertex);

    return result;
}

// Returns the point where the edge (v0, v1) crosses the plane.
// The caller guarantees that the boundary coordinates of v0 and v1 have different signs.
float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
{
    float t = saturate(v0.bc * rcp(v0.bc - v1.bc)); // Guaranteed to lie between 0 and 1

    return lerp(v0.pt, v1.pt, t);
}

// Clips the polygon stored in the ring buffer of the thread 'threadIdx' against the plane 'p'.
// The source polygon occupies 'srcSize' slots starting at 'srcBegin';
// the clipped polygon is appended right after it, and its range is returned
// via 'dstBegin' and 'dstSize'. Both ranges are unwrapped (the slots are taken modulo MAX_CLIP_VERTS).
void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize, uint threadIdx,
                             out uint dstBegin, out uint dstSize)
{
    dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
    dstSize  = 0;

    // The edge list is closed: the first edge starts at the last vertex.
    ClipVertex prevVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, (srcBegin + srcSize - 1) % MAX_CLIP_VERTS));

#ifdef OBTUSE_COMPILER
    // Maintain the wrapped indices incrementally instead of computing the modulo per iteration.
    uint srcSlot = srcBegin % MAX_CLIP_VERTS;
    uint dstSlot = dstBegin % MAX_CLIP_VERTS;
#endif

    for (uint k = srcBegin; k < (srcBegin + srcSize); k++)
    {
    #ifndef OBTUSE_COMPILER
        uint srcSlot = k % MAX_CLIP_VERTS;
    #endif
        ClipVertex currVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, srcSlot));

        // Execute Blinn's line clipping algorithm.
        // Classify the line segment. 4 cases:
        // 0. v0 out, v1 out -> add nothing
        // 1. v0 in,  v1 out -> add intersection
        // 2. v0 out, v1 in  -> add intersection, add v1
        // 3. v0 in,  v1 in  -> add v1
        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of -0.

        if ((prevVert.bc >= 0) != (currVert.bc >= 0))
        {
            // The line segment is guaranteed to cross the plane.
            float4 crossing = IntersectEdgeAgainstPlane(prevVert, currVert);
        #ifndef OBTUSE_COMPILER
            uint dstSlot = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
        #endif
            WriteToRingBuffer(threadIdx, dstSlot, crossing);
        #ifdef OBTUSE_COMPILER
            dstSize++;
            dstSlot++;
            dstSlot = (dstSlot == MAX_CLIP_VERTS) ? 0 : dstSlot;
        #endif
        }

        if (currVert.bc >= 0)
        {
        #ifndef OBTUSE_COMPILER
            uint dstSlot = (dstBegin + dstSize++) % MAX_CLIP_VERTS;
        #endif
            WriteToRingBuffer(threadIdx, dstSlot, currVert.pt);
        #ifdef OBTUSE_COMPILER
            dstSize++;
            dstSlot++;
            dstSlot = (dstSlot == MAX_CLIP_VERTS) ? 0 : dstSlot;
        #endif
        }

    #ifdef OBTUSE_COMPILER
        srcSlot++;
        srcSlot = (srcSlot == MAX_CLIP_VERTS) ? 0 : srcSlot;
    #endif
        prevVert = currVert; // Avoid recomputation and overwriting the vertex in the ring buffer
    }
}

// Loads the 4 vertices of the face 'f' into the ring buffer of the thread 'threadIdx'
// and clips the resulting quad against every plane that has at least one of its vertices behind it.
// The range of the clipped polygon is returned via 'srcBegin' and 'srcSize'.
void ClipFaceAgainstViewVolume(uint f, uint baseVertexOffset,
                               out uint srcBegin, out uint srcSize,
                               uint threadIdx)
{
    srcBegin = 0;
    srcSize  = 4;

    uint planesToClip = 0; // Initially in front
    uint packedVerts  = GetVertexListOfFace(f);

    for (uint k = 0; k < 4; k++)
    {
        uint v   = BitFieldExtract(packedVerts, 3 * k, 3);
        uint src = baseVertexOffset + v;

        // Non-zero if ANY of the vertices are behind any of the planes.
        planesToClip |= gs_BehindMasksOfVerts[src];

        // Not all edges may require clipping. However, filtering the vertex list
        // is somewhat expensive, so we currently don't do it.
        WriteToRingBuffer(threadIdx, k, float4(gs_HapVertsX[src],
                                               gs_HapVertsY[src],
                                               gs_HapVertsZ[src],
                                               gs_HapVertsW[src]));
    }

    // Sutherland-Hodgman polygon clipping algorithm.
    // It works by clipping the entire polygon against one clipping plane at a time.
    while (planesToClip != 0)
    {
        uint p = firstbitlow(planesToClip);

        uint dstBegin, dstSize;
        ClipPolygonAgainstPlane(p, srcBegin, srcSize, threadIdx, dstBegin, dstSize);

        // The output of this pass is the input of the next one.
        srcBegin = dstBegin;
        srcSize  = dstSize;

        planesToClip ^= 1 << p; // Clear the bit to continue using firstbitlow()
    }
}

// Extends the AABB (ndcAaBbMinPt, ndcAaBbMaxPt) with the vertices of the polygon
// stored in the ring buffer of the thread 'threadIdx' within the range given by 'srcBegin' and 'srcSize'.
// XYZ hold the coordinates after the perspective division, W holds the view-space Z.
void UpdateAaBb(uint srcBegin, uint srcSize, uint threadIdx,
                bool isOrthoProj, float4x4 invProjMat,
                inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
{
#ifdef OBTUSE_COMPILER
    // Maintain the wrapped index incrementally instead of computing the modulo per iteration.
    uint srcSlot = srcBegin % MAX_CLIP_VERTS;
#endif

    for (uint k = srcBegin; k < (srcBegin + srcSize); k++)
    {
    #ifndef OBTUSE_COMPILER
        uint srcSlot = k % MAX_CLIP_VERTS;
    #endif
        float4 hapVert = GetFromRingBuffer(threadIdx, srcSlot);

        // Clamp to the bounds in case of numerical errors (may still generate -0).
        float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
        float  rbpVertVSz = hapVert.w;

        if (isOrthoProj) // Must replace (w = 1)
        {
            rbpVertVSz = dot(invProjMat[2], hapVert);
        }

        float4 candidate = float4(rapVertNDC, rbpVertVSz);

        ndcAaBbMinPt = min(ndcAaBbMinPt, candidate);
        ndcAaBbMaxPt = max(ndcAaBbMaxPt, candidate);

    #ifdef OBTUSE_COMPILER
        srcSlot++;
        srcSlot = (srcSlot == MAX_CLIP_VERTS) ? 0 : srcSlot;
    #endif
    }
}

// Given: 'C' is the center of the sphere in the view space, 'r' is its radius;
// 'projScale' and 'projOffset' are used to perform projection of the X (or Y) component of a vector.
// Only C.x and C.z are used. Returns float2(min, max) of the projected coordinate.
float2 ComputeBoundsOfSphereOnProjectivePlane(float3 C, float r, float projScale, float projOffset)
{
    float xMin, xMax;

    // See sec. 8.2.1 of https://foundationsofgameenginedev.com/#fged2 for an alternative derivation.
    // Goal: find the planes that pass through the origin O, bound the sphere, and form
    // an axis-aligned rectangle at the intersection with the projection plane.
    // Solution (for the X-coordinate):
    // The intersection of the bounding planes and the projection plane must be vertical lines,
    // which means that the bounding planes must be tangent to the Y-axis.
    // The bounding planes must be also tangent to the sphere.
    // Call the intersection points of the two vertical bounding planes and the bounding
    // sphere B and D. Assume that B is on the left of C; D is on the right of C.
    // Note that C may be behind the origin, so the same generally goes for B and D.
    // BC is normal w.r.t. the bounding plane, so it is normal w.r.t. the Y-axis; |BC| = r.
    // As a consequence, it lies in a plane parallel to the O-X-Z plane.
    // Consider B'C', which is an orthogonal projection of BC onto the actual O-X-Z plane.
    // (Imagine sliding the sphere up or down between the bounding planes).
    // We then consider a triangle OB'C' that lies entirely in the O-X-Z plane.
    // The coordinates are: OB' = (b.x, 0, b.z), OC' = (c.x, 0, c.z).

    // OBC is a right triangle. So is OB'C'.
    // |BC| = |B'C'| = r.
    // |OB'|^2 = |OC'|^2 - |B'C'|^2.
    float lenSqOC = dot(C.xz, C.xz);
    float lenSqOB = lenSqOC - r * r;

    // If |OB'| = 0 or |OC'| = 0, the bounding planes tangent to the sphere do not exist.
    if (lenSqOB > 0)
    {
        float lenOB = sqrt(lenSqOB);

        // |OB' x OC'| = |OB'| * |OC'| * Sin[a'].
        // OB' . OC' = |OB'| * |OC'| * Cos[a'].
        // We can determine Sin[a'] = |B'C'| / |OC'| = R / |OC'|.
        // Cos[a'] = Sqrt[1 - Sin[a']^2].
        // (OB' x OC') points along Y.
        // (OB' x OC').y = b.z * c.x - b.x * c.z.
        // Therefore, b.z * c.x - b.x * c.z = |OB'| * |OC'| * Sin[a'].
        // OB' . OC' = b.x * c.x + b.z * c.z = |OB'| * |OC'| * Cos[a'].
        // Since we don't care about the scale, and |OB'| != 0 and |OC'| != 0,
        // we can equivalently solve
        // z * c.x - x * c.z = |OC'|^3 * Sin[a'].
        // x * c.x + z * c.z = |OC'|^3 * Cos[a'].
        // With 2 equations and 2 unknowns, we can easily solve this linear system.
        // The solution is
        // x = -c.z * r + c.x * |OB'|.
        // z =  c.x * r + c.z * |OB'|.
        float bx = C.x * lenOB - (C.z * r);
        float bz = C.z * lenOB + (C.x * r);

        // (OD' x OC') points along Y.
        // (OD' x OC').y = d.z * c.x - d.x * c.z.
        // We must solve
        // z * c.x - x * c.z = -|OC'|^3 * Sin[a'].
        // x * c.x + z * c.z =  |OC'|^3 * Cos[a'].
        // The solution is
        // x =  c.z * r + c.x * |OB'|.
        // z = -c.x * r + c.z * |OB'|.
        float dx = C.x * lenOB + (C.z * r);
        float dz = C.z * lenOB - (C.x * r);

        // We can transform OB and OD as direction vectors.
        // For the simplification below, see OptimizeProjectionMatrix.
        float rapBx = (bx * rcp(bz)) * projScale + projOffset;
        float rapDx = (dx * rcp(dz)) * projScale + projOffset;

        // One problem with the above is that this direction may, for certain spheres,
        // point behind the origin (B.z <= 0 or D.z <= 0).
        // At this point we know that the sphere is at least *partially* in front of the origin,
        // and that we are not inside the sphere, so there is at least one valid
        // plane (and one valid direction). We just need the second direction to go "in front"
        // of the first one to extend the bounding box.
        xMin = (bz > 0) ? rapBx : -FLT_INF;
        xMax = (dz > 0) ? rapDx :  FLT_INF;
    }
    else
    {
        // Conservative estimate (we do not cull the bounding sphere using the view frustum).
        xMin = -1;
        xMax =  1;
    }

    return float2(xMin, xMax);
}

//**********************************************************************************************
// The goal of this program is to compute the AABB of the light in the NDC space ([0, 1] range).
// The light is represented by a convex volume (a cuboid) with 6 faces (planar quads) and 8 vertices.
//
// Since a light volume may be partially off-screen, we must clip it before computing the AABB.
// Clipping the resulting AABB (rather than the light volume itself) may result in a loose AABB.
//
// To avoid having to deal with the "Moebius twist" property of the perspective transform,
// we perform clipping using the homogeneous (projective) post-perspective coordinates.
// This clipping method is described in Blinn's paper titled "Line Clipping".
//
// The algorithm processes a light on 4 threads.
// While all 6 faces may require clipping in the
// worst case, clipping more than 4 faces is very uncommon (typically, we clip 0, 3 or 4).
// Some faces may require culling rather than clipping (the former is simpler).
//
// It's important to realize that face culling may end up culling 5 (or even all 6) faces.
// This means that the clipped light volume may be reduced to a single polygon, or nothing at all.
// (Imagine a view volume completely or partially inside a light volume).
// Therefore, we must perform view-volume-corner-inside-light-volume tests.
//
//
// Notation:
// rbp - real (3D) coordinates before perspective
// hbp - hom. (4D) coordinates before perspective
// hap - hom. (4D) coordinates after perspective
// rap - real (3D) coordinates after perspective (after division by w)
// *********************************************************************************************

// Kernel entry point.
// threadID - index of the thread within the group (THREADS_PER_LIGHT consecutive threads share a light).
// groupID  - X selects a batch of LIGHTS_PER_GROUP lights, Y selects the eye.
// Writes the min and max points of the AABB of each visible light into g_vBoundsBuffer.
[numthreads(THREADS_PER_GROUP, 1, 1)]
void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
{
    const uint t        = threadID;
    const uint g        = groupID.x;
    const uint eyeIndex = groupID.y; // Currently, can only be 0 or 1

    const uint intraGroupLightIndex = t / THREADS_PER_LIGHT;
    const uint globalLightIndex     = g * LIGHTS_PER_GROUP + intraGroupLightIndex;
    const uint baseVertexOffset     = intraGroupLightIndex * NUM_VERTS;

    const uint eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
    const SFiniteLightBound cullData  = g_data[eyeAdjustedInputOffset];

    const float4x4 projMat    = g_mProjectionArr[eyeIndex];
    const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex];

    // Bounding frustum.
    const float3 rbpC  = cullData.center.xyz;   // View-space
    const float3 rbpX  = cullData.boxAxisX.xyz; // Pre-scaled
    const float3 rbpY  = cullData.boxAxisY.xyz; // Pre-scaled
    const float3 rbpZ  = cullData.boxAxisZ.xyz; // Pre-scaled
    const float  scale = cullData.scaleXY;      // scale.x = scale.y
    // Bounding sphere.
    const float radius = cullData.radius;

#ifndef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    // (0) Initialize the TGSM.
    if (t % THREADS_PER_LIGHT == 0) // Avoid bank conflicts
    {
        gs_CullClipFaceMasks[intraGroupLightIndex] = 0; // Initially inside
        gs_NdcAaBbMinPtX[intraGroupLightIndex] = asuint(1.0f);
        gs_NdcAaBbMaxPtX[intraGroupLightIndex] = asuint(0.0f);
        gs_NdcAaBbMinPtY[intraGroupLightIndex] = asuint(1.0f);
        gs_NdcAaBbMaxPtY[intraGroupLightIndex] = asuint(0.0f);
        gs_NdcAaBbMinPtZ[intraGroupLightIndex] = asuint(1.0f);
        gs_NdcAaBbMaxPtZ[intraGroupLightIndex] = asuint(0.0f);
        gs_NdcAaBbMinPtW[intraGroupLightIndex] = asuint(FLT_INF);
        gs_NdcAaBbMaxPtW[intraGroupLightIndex] = asuint(0.0f);
    }

    // The values above are written by a single thread per light, while the other threads
    // of that light apply InterlockedOr() to gs_CullClipFaceMasks[] at the end of step (1).
    // Without a barrier, nothing orders the initialization before those atomics.
    GroupMemoryBarrierWithGroupSync();
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS

    float4 ndcAaBbMinPt = float4(1, 1, 1, FLT_INF);
    float4 ndcAaBbMaxPt = 0;

    // We must determine whether we have to clip or cull any of the faces.
    // If all vertices of a face are inside with respect to all the culling planes,
    // we can trivially accept that face. If all vertices of a face are behind
    // any single plane, we can trivially reject (cull) that face.
    uint cullClipFaceMask = 0; // Initially inside

    uint i; // Avoid multiply-declared variable warning

    // (1) Compute the vertices of the light volume.
    for (i = 0; i < VERTS_PER_THREAD; i++)
    {
        uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

        // rbpVerts[0] = rbpC - rbpX * scale - rbpY * scale - rbpZ; (-s, -s, -1)
        // rbpVerts[1] = rbpC + rbpX * scale - rbpY * scale - rbpZ; (+s, -s, -1)
        // rbpVerts[2] = rbpC - rbpX * scale + rbpY * scale - rbpZ; (-s, +s, -1)
        // rbpVerts[3] = rbpC + rbpX * scale + rbpY * scale - rbpZ; (+s, +s, -1)
        // rbpVerts[4] = rbpC - rbpX         - rbpY         + rbpZ; (-1, -1, +1)
        // rbpVerts[5] = rbpC + rbpX         - rbpY         + rbpZ; (+1, -1, +1)
        // rbpVerts[6] = rbpC - rbpX         + rbpY         + rbpZ; (-1, +1, +1)
        // rbpVerts[7] = rbpC + rbpX         + rbpY         + rbpZ; (+1, +1, +1)

        float3 m = GenerateVertexOfStandardCube(v);
        m.xy *= ((v & 4) == 0) ? scale : 1; // X, Y in [-scale, scale]

        float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
        // Avoid generating (w = 0).
        rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN;

        float4 hapVert = mul(projMat, float4(rbpVertVS, 1));

        // Warning: the W component may be negative.
        // Flipping the -W pyramid by negating all coordinates is incorrect
        // and will break both classification and clipping.
        // For the orthographic projection, (w = 1).

        // Transform the X and Y components: [-w, w] -> [0, w].
        hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);

        // For each vertex, we must determine whether it is within the bounds.
        // For culling and clipping, we must know, per culling plane, whether the vertex
        // is in the positive or the negative half-space.
        uint behindMask = 0; // Initially in front

        // Consider the vertex to be inside the view volume if:
        // 0 <= x <= w
        // 0 <= y <= w <-- include boundary points to avoid clipping them later
        // 0 <= z <= w
        // w is always valid
        // TODO: epsilon for numerical robustness?

        for (uint j = 0; j < (NUM_PLANES / 2); j++)
        {
            float w = hapVert.w;

            behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
            behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
        }

        if (behindMask == 0) // Inside?
        {
            // Clamp to the bounds in case of numerical errors (may still generate -0).
            float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));

            ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVS.z));
            ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVS.z));
        }
        else // Outside
        {
            // Mark all the faces of the bounding frustum associated with this vertex.
            cullClipFaceMask |= GetFaceMaskOfVertex(v);
        }

        // Publish the vertex for the culling and clipping steps performed by the sibling threads.
        gs_HapVertsX[baseVertexOffset + v]          = hapVert.x;
        gs_HapVertsY[baseVertexOffset + v]          = hapVert.y;
        gs_HapVertsZ[baseVertexOffset + v]          = hapVert.z;
        gs_HapVertsW[baseVertexOffset + v]          = hapVert.w;
        gs_BehindMasksOfVerts[baseVertexOffset + v] = behindMask;
    }

    // Combine (OR) the face masks of all the threads processing the same light.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
    {
        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
        uint orMask  = 0;                       // Plays no role
        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB

        cullClipFaceMask |= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
    }
#else
    InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);

    GroupMemoryBarrierWithGroupSync();

    cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
#endif

    // (2) Test the corners of the view volume.
    if (cullClipFaceMask != 0)
    {
        // The light is partially outside the view volume.
        // Therefore, some of the corners of the view volume may be inside the light volume.
        // We perform aggressive culling, so we must make sure they are accounted for.
        // The light volume is a special type of cuboid - a right frustum.
        // We can exploit this fact by building a light-space projection matrix.
        // P_v = T * (R * S) * P_l
        // P_l = (R * S)^{-1} * T^{-1} * P_v
        float4x4 invTranslateToLightSpace      = Translation4x4(-rbpC);
        float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(ScaledRotation3x3(rbpX, rbpY, rbpZ)));
        // TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.

        // This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
        float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace);

        if (scale != 1) // Perspective light space?
        {
            // Compute the parameters of the perspective projection.
            float s = scale;
            float e = -1 - 2 * (s * rcp(1 - s)); // Signed distance from the origin to the eye
            float n = -e - 1;                    // Distance from the eye to the near plane
            float f = -e + 1;                    // Distance from the eye to the far plane
            float g = f;                         // Distance from the eye to the projection plane

            float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
            float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f);

            lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix);
        }

        for (i = 0; i < VERTS_PER_THREAD; i++)
        {
            uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

            float3 rapVertCS = GenerateVertexOfStandardCube(v);
            rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1]

            float4 hbpVertVS = mul(invProjMat, float4(rapVertCS, 1)); // Clip to view space
            float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS);      // View to light space

            // Consider the vertex to be inside the light volume if:
            // -w < x < w
            // -w < y < w <-- exclude boundary points, as we will not clip using these vertices
            // -w < z < w <-- assume that Z-precision is not very important here
            // 0 < w
            // TODO: epsilon for numerical robustness?

            bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w;

            if (inside)
            {
                float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
                float  rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);

                ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
                ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));
            }
        }
    }

#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    // The steps below read the vertices written to the TGSM by the sibling threads in step (1).
    // (The other code path has already synchronized after InterlockedOr().)
    GroupMemoryBarrierWithGroupSync();
#endif

    // (3) Cull the faces.
    {
        const uint cullFaceMask   = cullClipFaceMask;
        const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]

        for (i = 0; i < FACES_PER_THREAD; i++)
        {
            uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

            if (n < numFacesToCull)
            {
                uint f = NthBitLow(cullFaceMask, n);

                if (TryCullFace(f, baseVertexOffset))
                {
                    cullClipFaceMask ^= 1 << f; // Clear the bit
                }
            }
        }
    }

    // Combine (AND) the face masks: a face culled by any thread is removed for all of them.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
    {
        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
        uint orMask  = 0;                       // Plays no role
        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB

        cullClipFaceMask &= LaneSwizzle(cullClipFaceMask, andMask, orMask, xorMask);
    }
#else
    InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);

    GroupMemoryBarrierWithGroupSync();

    cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
#endif

    // (4) Clip the faces.
    {
        const uint clipFaceMask   = cullClipFaceMask;
        const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]

        for (i = 0; i < FACES_PER_THREAD; i++)
        {
            uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;

            if (n < numFacesToClip)
            {
                uint f = NthBitLow(clipFaceMask, n);

                uint srcBegin, srcSize;
                ClipFaceAgainstViewVolume(f, baseVertexOffset,
                                          srcBegin, srcSize, t);
                UpdateAaBb(srcBegin, srcSize, t, g_isOrthographic != 0, invProjMat,
                           ndcAaBbMinPt, ndcAaBbMaxPt);
            }
        }
    }

    // Combine the partial AABBs of all the threads processing the same light.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    for (i = 0; i < FastLog2(THREADS_PER_LIGHT); i++)
    {
        uint andMask = PLATFORM_LANE_COUNT - 1; // All lanes
        uint orMask  = 0;                       // Plays no role
        uint xorMask = 1 << i;                  // Flip bits one by one starting from the LSB

        ndcAaBbMinPt.x = min(ndcAaBbMinPt.x, LaneSwizzle(ndcAaBbMinPt.x, andMask, orMask, xorMask));
        ndcAaBbMaxPt.x = max(ndcAaBbMaxPt.x, LaneSwizzle(ndcAaBbMaxPt.x, andMask, orMask, xorMask));
        ndcAaBbMinPt.y = min(ndcAaBbMinPt.y, LaneSwizzle(ndcAaBbMinPt.y, andMask, orMask, xorMask));
        ndcAaBbMaxPt.y = max(ndcAaBbMaxPt.y, LaneSwizzle(ndcAaBbMaxPt.y, andMask, orMask, xorMask));
        ndcAaBbMinPt.z = min(ndcAaBbMinPt.z, LaneSwizzle(ndcAaBbMinPt.z, andMask, orMask, xorMask));
        ndcAaBbMaxPt.z = max(ndcAaBbMaxPt.z, LaneSwizzle(ndcAaBbMaxPt.z, andMask, orMask, xorMask));
        ndcAaBbMinPt.w = min(ndcAaBbMinPt.w, LaneSwizzle(ndcAaBbMinPt.w, andMask, orMask, xorMask));
        ndcAaBbMaxPt.w = max(ndcAaBbMaxPt.w, LaneSwizzle(ndcAaBbMaxPt.w, andMask, orMask, xorMask));
    }
#else
    // Integer comparison works for floating-point numbers as long as the sign bit is 0.
    // We must take care of -0 ourselves. saturate() does not help.
    InterlockedMin(gs_NdcAaBbMinPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.x)));
    InterlockedMax(gs_NdcAaBbMaxPtX[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.x)));
    InterlockedMin(gs_NdcAaBbMinPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.y)));
    InterlockedMax(gs_NdcAaBbMaxPtY[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.y)));
    InterlockedMin(gs_NdcAaBbMinPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.z)));
    InterlockedMax(gs_NdcAaBbMaxPtZ[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.z)));
    InterlockedMin(gs_NdcAaBbMinPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMinPt.w)));
    InterlockedMax(gs_NdcAaBbMaxPtW[intraGroupLightIndex], asuint(CLEAR_SIGN_BIT(ndcAaBbMaxPt.w)));

    GroupMemoryBarrierWithGroupSync();

    ndcAaBbMinPt.x = asfloat(gs_NdcAaBbMinPtX[intraGroupLightIndex]);
    ndcAaBbMaxPt.x = asfloat(gs_NdcAaBbMaxPtX[intraGroupLightIndex]);
    ndcAaBbMinPt.y = asfloat(gs_NdcAaBbMinPtY[intraGroupLightIndex]);
    ndcAaBbMaxPt.y = asfloat(gs_NdcAaBbMaxPtY[intraGroupLightIndex]);
    ndcAaBbMinPt.z = asfloat(gs_NdcAaBbMinPtZ[intraGroupLightIndex]);
    ndcAaBbMaxPt.z = asfloat(gs_NdcAaBbMaxPtZ[intraGroupLightIndex]);
    ndcAaBbMinPt.w = asfloat(gs_NdcAaBbMinPtW[intraGroupLightIndex]);
    ndcAaBbMaxPt.w = asfloat(gs_NdcAaBbMaxPtW[intraGroupLightIndex]);
#endif // PLATFORM_SUPPORTS_WAVE_INTRINSICS

    // (5) Compute the AABB of the bounding sphere.
    if (radius > 0)
    {
        // Occasionally, an intersection of AABBs of a bounding sphere and a bounding frustum
        // results in a tighter AABB when compared to using the AABB of the frustum alone.
        // That is the case (mostly) for sphere-capped spot lights with very wide angles.
        // Note that, unfortunately, it is not quite as tight as an AABB of a CSG intersection
        // of a sphere and frustum. Also note that the algorithm below doesn't clip the bounding
        // sphere against the view frustum before computing the bounding box, simply because it is
        // too hard/expensive. I will leave it as a TODO in case someone wants to tackle this problem.
        if ((rbpC.z + radius) > 0) // Is the sphere at least *partially* in front of the origin?
        {
            ndcAaBbMinPt.w = max(ndcAaBbMinPt.w, rbpC.z - radius);
            ndcAaBbMaxPt.w = min(ndcAaBbMaxPt.w, rbpC.z + radius);
            // Computing the 'z' component for an arbitrary projection matrix is hard, so we don't do it.
            // See sec. 8.2.2 of https://foundationsofgameenginedev.com/#fged2 for a solution.

            float2 rectMin, rectMax;

            // For the 'x' and 'y' components, the solution is given below.
            if (g_isOrthographic)
            {
                // Compute the center and the extents (half-diagonal) of the bounding box.
                float2 center  = mul(projMat, float4(rbpC.xyz, 1)).xy;
                float2 extents = mul(projMat, float4(radius.xx, 0, 0)).xy;

                rectMin = center - extents;
                rectMax = center + extents;
            }
            else // Perspective
            {
                float2 xBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.xxz, radius, projMat._m00, projMat._m02); // X-Z plane
                float2 yBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.yyz, radius, projMat._m11, projMat._m12); // Y-Z plane

                rectMin = float2(xBounds.r, yBounds.r);
                rectMax = float2(xBounds.g, yBounds.g);
            }

            // Transform to the NDC coordinates.
            rectMin = rectMin * 0.5 + 0.5;
            rectMax = rectMax * 0.5 + 0.5;

            // Note: separating the X- and Y-computations across 2 threads is not worth it.
            ndcAaBbMinPt.xy = max(ndcAaBbMinPt.xy, rectMin);
            ndcAaBbMaxPt.xy = min(ndcAaBbMaxPt.xy, rectMax);
        }
    }

    // Only one thread per light writes the result, and only for lights that actually exist.
    if ((globalLightIndex < (uint)g_iNrVisibLights) && (t % THREADS_PER_LIGHT == 0)) // Avoid bank conflicts
    {
        // For stereo, we have two sets of lights. Therefore, each eye has a set of mins
        // followed by a set of maxs, and each set is equal to g_iNrVisibLights.
        const ScreenSpaceBoundsIndices eyeAdjustedOutputOffsets = GenerateScreenSpaceBoundsIndices(globalLightIndex, g_iNrVisibLights, eyeIndex);

        g_vBoundsBuffer[eyeAdjustedOutputOffsets.min] = ndcAaBbMinPt;
        g_vBoundsBuffer[eyeAdjustedOutputOffsets.max] = ndcAaBbMaxPt;
    }
}