#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

// Kernel table. Every entry compiles the same entry point with a different set of defines:
//   KERNEL_NAME    - name given to the entry point for that variant.
//   NUM_SAMPLES    - number of stencil samples read per pixel.
//   MSAA           - the stencil input is a multisampled texture.
//   RESOLVE        - the per-pixel resolved stencil is written to _OutputStencilBuffer.
//   COARSE_STENCIL - the per-group OR of the stencil is written to _CoarseStencilBuffer.
// The *_RESOLVE_ONLY variants leave COARSE_STENCIL undefined.
#pragma kernel MAIN KERNEL_NAME=MAIN NUM_SAMPLES=1 COARSE_STENCIL
#pragma kernel MAIN_MSAA_2 KERNEL_NAME=MAIN_MSAA_2 NUM_SAMPLES=2 MSAA COARSE_STENCIL
#pragma kernel MAIN_MSAA_4 KERNEL_NAME=MAIN_MSAA_4 NUM_SAMPLES=4 MSAA COARSE_STENCIL
#pragma kernel MAIN_MSAA_8 KERNEL_NAME=MAIN_MSAA_8 NUM_SAMPLES=8 MSAA COARSE_STENCIL
#pragma kernel MAIN_MSAA_2_RESOLVE KERNEL_NAME=MAIN_MSAA_2_RESOLVE NUM_SAMPLES=2 MSAA RESOLVE COARSE_STENCIL
#pragma kernel MAIN_MSAA_4_RESOLVE KERNEL_NAME=MAIN_MSAA_4_RESOLVE NUM_SAMPLES=4 MSAA RESOLVE COARSE_STENCIL
#pragma kernel MAIN_MSAA_8_RESOLVE KERNEL_NAME=MAIN_MSAA_8_RESOLVE NUM_SAMPLES=8 MSAA RESOLVE COARSE_STENCIL
#pragma kernel MAIN_MSAA_2_RESOLVE_ONLY KERNEL_NAME=MAIN_MSAA_2_RESOLVE_ONLY NUM_SAMPLES=2 MSAA RESOLVE
#pragma kernel MAIN_MSAA_4_RESOLVE_ONLY KERNEL_NAME=MAIN_MSAA_4_RESOLVE_ONLY NUM_SAMPLES=4 MSAA RESOLVE
#pragma kernel MAIN_MSAA_8_RESOLVE_ONLY KERNEL_NAME=MAIN_MSAA_8_RESOLVE_ONLY NUM_SAMPLES=8 MSAA RESOLVE

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"

// Stencil input. MSAA variants read a multisampled texture and additionally declare
// the writable target that receives the resolved stencil.
#if defined(MSAA)
TEXTURE2D_X_MSAA(uint2, _StencilTexture);
RW_TEXTURE2D_X(uint2, _OutputStencilBuffer);
#else
TEXTURE2D_X_UINT2(_StencilTexture);
#endif

// TODO: This wastes 3 bytes per entry, but it is still better than a texture because it can be scalar read.
// We could sub-index the right byte inside the uint, but that takes extra ALU and won't save bandwidth (just memory).
// For now the extra memory cost is acceptable (3 bytes * 1/64th of a render target).
// Note that using RawBuffers seems to have problems, so structured buffers are used for now;
// this is worth revisiting if the perf difference is a concern.
// Coarse stencil output: one uint per 8x8 thread group, holding the bitwise OR of the
// stencil values resolved by the threads of that group.
RWStructuredBuffer<uint> _CoarseStencilBuffer;

// The wave-intrinsic reduction is only enabled when the platform reports 64 lanes per wave,
// which matches the 8x8 = 64 threads of a group (presumably so that one wave spans the whole group).
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
#define USE_INTRINSICS (PLATFORM_LANE_COUNT == 64)
#else
#define USE_INTRINSICS 0
#endif

#if USE_INTRINSICS == 0
// Group-wide accumulator used by the InterlockedOr fallback path.
groupshared uint coarseStencilValue;
#endif

// Resolves the stencil of one pixel per thread and, depending on the variant defines:
//   RESOLVE        - writes the per-pixel result to _OutputStencilBuffer.
//   COARSE_STENCIL - ORs the results of the whole thread group together and has the
//                    first thread of the group store that value in _CoarseStencilBuffer.
//
// groupId          - thread group index; xy (and z as the last argument) feed the coarse buffer address.
// groupThreadId    - thread index inside the group; (0, 0) is the thread that writes the coarse value.
// dispatchThreadID - xy is the pixel coordinate, z is passed to UNITY_XR_ASSIGN_VIEW_INDEX.
[numthreads(8, 8, 1)]
void KERNEL_NAME(uint3 groupId          : SV_GroupID,
                 uint3 groupThreadId    : SV_GroupThreadID,
                 uint3 dispatchThreadID : SV_DispatchThreadID)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadID.z);

    // The best shot at resolving is being overly conservative, hence the OR operator.
    // This is by nature inaccurate.
    uint resolvedStencil = 0;

    // Threads that fall outside the screen keep a stencil of 0, which is neutral for the OR reductions below.
    if (dispatchThreadID.x < (uint)_ScreenSize.x && dispatchThreadID.y < (uint)_ScreenSize.y)
    {
        UNITY_UNROLL
        for (uint i = 0; i < NUM_SAMPLES; i++)
        {
            uint2 sampledStencil;
#ifndef MSAA
            sampledStencil = LOAD_TEXTURE2D_X(_StencilTexture, dispatchThreadID.xy);
#else
            sampledStencil = LOAD_TEXTURE2D_X_MSAA(_StencilTexture, dispatchThreadID.xy, i);
#endif
            // Accumulate every sample; a plain assignment would keep only the last sample.
            resolvedStencil |= GetStencilValue(sampledStencil);
        }
    }

#ifdef RESOLVE
    _OutputStencilBuffer[COORD_TEXTURE2D_X(dispatchThreadID.xy)] = uint2(resolvedStencil, resolvedStencil);
#endif

#ifdef COARSE_STENCIL

    // WaveIsFirstLane() incorrectly triggers a warning on Xbox One, so both paths identify
    // the writing thread from groupThreadId instead.
    bool isFirstThread = groupThreadId.x == 0 && groupThreadId.y == 0;

#if USE_INTRINSICS
    uint coarseStencilValue = WaveActiveBitOr(resolvedStencil);
#else
    if (isFirstThread)
    {
        coarseStencilValue = 0;
    }

    // First barrier: the cleared accumulator must be visible before any thread ORs into it.
    GroupMemoryBarrierWithGroupSync();
    InterlockedOr(coarseStencilValue, resolvedStencil);
    // Second barrier: every contribution must have landed before the value is read back.
    GroupMemoryBarrierWithGroupSync();
#endif

    // This temp is needed outside the if (isFirstThread) condition to work around a DXC DXIL codegen
    // issue https://github.com/microsoft/DirectXShaderCompiler/issues/2743 until it's fixed.
    uint perThreadCoarseStencilValue = coarseStencilValue;
    if (isFirstThread)
    {
        uint addressIndex = Get1DAddressFromPixelCoord(groupId.xy, _CoarseStencilBufferSize.xy, groupId.z);
        _CoarseStencilBuffer[addressIndex] = perThreadCoarseStencilValue;
    }
#endif
}