Arcade-Maniac/Source/Library/PackageCache/com.unity.render-pipelines.high-definition@11.0.0/Runtime/ShaderLibrary/ResolveStencilBuffer.compute

#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

// Every kernel below compiles the same entry point (named through KERNEL_NAME) with a different set of defines:
//   NUM_SAMPLES    - number of stencil samples read per pixel (loop count in the kernel).
//   MSAA           - the input stencil texture is multisampled and _OutputStencilBuffer is declared.
//   RESOLVE        - the per-pixel resolved stencil is written to _OutputStencilBuffer.
//   COARSE_STENCIL - the per-thread-group OR of the stencil is written to _CoarseStencilBuffer.

// Coarse stencil only (no per-pixel resolve output).
#pragma kernel MAIN         KERNEL_NAME=MAIN         NUM_SAMPLES=1      COARSE_STENCIL
#pragma kernel MAIN_MSAA_2  KERNEL_NAME=MAIN_MSAA_2  NUM_SAMPLES=2 MSAA COARSE_STENCIL
#pragma kernel MAIN_MSAA_4  KERNEL_NAME=MAIN_MSAA_4  NUM_SAMPLES=4 MSAA COARSE_STENCIL
#pragma kernel MAIN_MSAA_8  KERNEL_NAME=MAIN_MSAA_8  NUM_SAMPLES=8 MSAA COARSE_STENCIL

// Per-pixel resolve output and coarse stencil.
#pragma kernel MAIN_MSAA_2_RESOLVE  KERNEL_NAME=MAIN_MSAA_2_RESOLVE  NUM_SAMPLES=2 MSAA RESOLVE COARSE_STENCIL
#pragma kernel MAIN_MSAA_4_RESOLVE  KERNEL_NAME=MAIN_MSAA_4_RESOLVE  NUM_SAMPLES=4 MSAA RESOLVE COARSE_STENCIL
#pragma kernel MAIN_MSAA_8_RESOLVE  KERNEL_NAME=MAIN_MSAA_8_RESOLVE  NUM_SAMPLES=8 MSAA RESOLVE COARSE_STENCIL

// Per-pixel resolve output only (no coarse stencil).
#pragma kernel MAIN_MSAA_2_RESOLVE_ONLY  KERNEL_NAME=MAIN_MSAA_2_RESOLVE_ONLY  NUM_SAMPLES=2 MSAA RESOLVE
#pragma kernel MAIN_MSAA_4_RESOLVE_ONLY  KERNEL_NAME=MAIN_MSAA_4_RESOLVE_ONLY  NUM_SAMPLES=4 MSAA RESOLVE
#pragma kernel MAIN_MSAA_8_RESOLVE_ONLY  KERNEL_NAME=MAIN_MSAA_8_RESOLVE_ONLY  NUM_SAMPLES=8 MSAA RESOLVE


#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"

// Input stencil texture read by the kernel. The MSAA variants bind a multisampled texture and additionally
// declare the per-pixel output that receives the resolved stencil when RESOLVE is defined.
#ifdef MSAA
TEXTURE2D_X_MSAA(uint2, _StencilTexture);
RW_TEXTURE2D_X(uint2, _OutputStencilBuffer);
#else
TEXTURE2D_X_UINT2(_StencilTexture);
#endif

// Coarse stencil output: the kernel writes one uint per 8x8 thread group.
// TODO: This wastes 3 bytes per entry, but it is still better than a texture because it can be scalar read.
// We could sub-index the right byte inside the uint, but that takes extra ALU and won't save bandwidth (just memory).
// For now the extra memory cost is acceptable (3 bytes * 1/64th of a render target).
// Note that using RawBuffers seems to cause problems, so structured buffers are used for now; this is worth
// revisiting if the perf difference is a concern.
RWStructuredBuffer<uint>    _CoarseStencilBuffer;

// Wave intrinsics are only used when the platform supports them and its lane count is 64.
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
    #define USE_INTRINSICS (PLATFORM_LANE_COUNT == 64)
#else
    #define USE_INTRINSICS 0
#endif

// Fallback accumulator for the per-group OR when wave intrinsics are not used.
#if USE_INTRINSICS == 0
groupshared uint coarseStencilValue;
#endif

// Resolves the stencil of one pixel per thread and, depending on the variant defines:
//   RESOLVE        - writes the resolved value to both channels of _OutputStencilBuffer.
//   COARSE_STENCIL - ORs the resolved values of the whole 8x8 thread group together and has the first
//                    thread of the group store the result in _CoarseStencilBuffer.
// dispatchThreadID.xy is the pixel coordinate and dispatchThreadID.z is used as the XR view index.
[numthreads(8, 8, 1)]
void KERNEL_NAME(uint3 groupId          : SV_GroupID,
                 uint3 groupThreadId    : SV_GroupThreadID,
                 uint3 dispatchThreadID : SV_DispatchThreadID)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadID.z);

    // The best shot at resolving is being overly conservative, hence the OR operator. This is by nature inaccurate.
    // Threads that fall outside of the screen keep 0 so they do not contribute any bit to the coarse value.
    uint resolvedStencil = 0;

    if (dispatchThreadID.x < (uint)_ScreenSize.x && dispatchThreadID.y < (uint)_ScreenSize.y)
    {
        UNITY_UNROLL
        for (uint i = 0; i < NUM_SAMPLES; i++)
        {
            uint2 sampledStencil;
#ifndef MSAA
            sampledStencil = LOAD_TEXTURE2D_X(_StencilTexture, dispatchThreadID.xy);
#else
            sampledStencil = LOAD_TEXTURE2D_X_MSAA(_StencilTexture, dispatchThreadID.xy, i);
#endif
            // Accumulate with OR: a plain assignment would only keep the last sample and silently drop
            // stencil bits that are set on the other samples of the pixel.
            resolvedStencil |= GetStencilValue(sampledStencil);
        }
    }
#ifdef RESOLVE
    _OutputStencilBuffer[COORD_TEXTURE2D_X(dispatchThreadID.xy)] = uint2(resolvedStencil, resolvedStencil);
#endif

#ifdef COARSE_STENCIL

#if USE_INTRINSICS

    // We need to work around a warning incorrectly triggered on Xbox One, so instead of using WaveIsFirstLane()
    // we check the groupThreadId as in the non-intrinsic version.
    //bool isFirstThread = WaveIsFirstLane();
    bool isFirstThread = groupThreadId.x == 0 && groupThreadId.y == 0;
    // USE_INTRINSICS is only enabled for a lane count of 64, which matches the 8x8 group size.
    uint coarseStencilValue = WaveActiveBitOr(resolvedStencil);


#else

    bool isFirstThread = groupThreadId.x == 0 && groupThreadId.y == 0;
    if (isFirstThread)
    {
        coarseStencilValue = 0;
    }

    // Make sure the accumulator is cleared before any thread ORs into it.
    GroupMemoryBarrierWithGroupSync();

    InterlockedOr(coarseStencilValue, resolvedStencil);

    // Make sure every thread has contributed before the value is read back.
    GroupMemoryBarrierWithGroupSync();

#endif

    // This temp is needed outside the if(isFirstThread) condition to work around a DXC DXIL codegen
    // issue https://github.com/microsoft/DirectXShaderCompiler/issues/2743 until it's fixed
    uint perThreadCoarseStencilValue = coarseStencilValue;

    if (isFirstThread)
    {
        // One entry per thread group, addressed by the group coordinate; groupId.z is passed as the third argument.
        uint addressIndex = Get1DAddressFromPixelCoord(groupId.xy, _CoarseStencilBufferSize.xy, groupId.z);
        _CoarseStencilBuffer[addressIndex] = perThreadCoarseStencilValue;
    }

#endif

}