// Copied & modified from ColorPyramid.compute

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"

#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

#pragma kernel KMain          MAIN=KMain
#pragma kernel KDownsample    MAIN=KDownsample  DOWNSAMPLE


TEXTURE2D_X(_InputTexture);

RW_TEXTURE2D_X(float3, _OutputTexture);

CBUFFER_START(cb0)
    float4 _TexelSize;       // xy: size, zw: texel size
CBUFFER_END

// 16x16 pixels with an 8x8 center that we will be blurring writing out. Each uint is two color
// channels packed together.
// The reason for separating channels is to reduce bank conflicts in the local data memory
// controller. A large stride will cause more threads to collide on the same memory bank.
groupshared uint gs_cacheR[128];
groupshared uint gs_cacheG[128];
groupshared uint gs_cacheB[128];

float3 BlurPixels(float3 a, float3 b, float3 c, float3 d, float3 e, float3 f, float3 g, float3 h, float3 i)
{
    return 0.27343750 * (e    )
         + 0.21875000 * (d + f)
         + 0.10937500 * (c + g)
         + 0.03125000 * (b + h)
         + 0.00390625 * (a + i);
}

void Store2Pixels(uint index, float3 pixel1, float3 pixel2)
{
    gs_cacheR[index] = f32tof16(pixel1.r) | f32tof16(pixel2.r) << 16;
    gs_cacheG[index] = f32tof16(pixel1.g) | f32tof16(pixel2.g) << 16;
    gs_cacheB[index] = f32tof16(pixel1.b) | f32tof16(pixel2.b) << 16;
}

void Load2Pixels(uint index, out float3 pixel1, out float3 pixel2)
{
    uint rr = gs_cacheR[index];
    uint gg = gs_cacheG[index];
    uint bb = gs_cacheB[index];
    pixel1 = float3(f16tof32(rr      ), f16tof32(gg      ), f16tof32(bb      ));
    pixel2 = float3(f16tof32(rr >> 16), f16tof32(gg >> 16), f16tof32(bb >> 16));
}

void Store1Pixel(uint index, float3 pixel)
{
    gs_cacheR[index] = asuint(pixel.r);
    gs_cacheG[index] = asuint(pixel.g);
    gs_cacheB[index] = asuint(pixel.b);
}

void Load1Pixel(uint index, out float3 pixel)
{
    pixel = asfloat(uint3(gs_cacheR[index], gs_cacheG[index], gs_cacheB[index]));
}

// Blur two pixels horizontally. This reduces LDS reads and pixel unpacking.
void BlurHorizontally(uint outIndex, uint leftMostIndex)
{
    float3 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
    Load2Pixels(leftMostIndex + 0, s0, s1);
    Load2Pixels(leftMostIndex + 1, s2, s3);
    Load2Pixels(leftMostIndex + 2, s4, s5);
    Load2Pixels(leftMostIndex + 3, s6, s7);
    Load2Pixels(leftMostIndex + 4, s8, s9);

    Store1Pixel(outIndex    , BlurPixels(s0, s1, s2, s3, s4, s5, s6, s7, s8));
    Store1Pixel(outIndex + 1, BlurPixels(s1, s2, s3, s4, s5, s6, s7, s8, s9));
}

void BlurVertically(uint2 pixelCoord, uint topMostIndex)
{
    float3 s0, s1, s2, s3, s4, s5, s6, s7, s8;
    Load1Pixel(topMostIndex     , s0);
    Load1Pixel(topMostIndex +  8, s1);
    Load1Pixel(topMostIndex + 16, s2);
    Load1Pixel(topMostIndex + 24, s3);
    Load1Pixel(topMostIndex + 32, s4);
    Load1Pixel(topMostIndex + 40, s5);
    Load1Pixel(topMostIndex + 48, s6);
    Load1Pixel(topMostIndex + 56, s7);
    Load1Pixel(topMostIndex + 64, s8);

    float3 blurred = BlurPixels(s0, s1, s2, s3, s4, s5, s6, s7, s8);

    // Guard bands
    blurred *= all(pixelCoord < uint2(_TexelSize.xy));

    // Write to the final target
    _OutputTexture[COORD_TEXTURE2D_X(pixelCoord)] = blurred;
}

#define GROUP_SIZE 8

[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void MAIN(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);

    // Upper-left pixel coordinate of quad that this thread will read
    int2 threadUL = (groupThreadId << 1) + (groupId << 3) - 4;

#if DOWNSAMPLE
    float2 offset = float2(threadUL);
    float2 maxCoord = 1.0 - 0.5f *_TexelSize.zw;
    float3 p00 = SAMPLE_TEXTURE2D_X_LOD(_InputTexture, s_linear_clamp_sampler, ClampAndScaleUVForBilinear((offset                    + 0.5) * _TexelSize.zw, _TexelSize.zw), 0.0).xyz;
    float3 p10 = SAMPLE_TEXTURE2D_X_LOD(_InputTexture, s_linear_clamp_sampler, ClampAndScaleUVForBilinear((offset + float2(1.0, 0.0) + 0.5) * _TexelSize.zw, _TexelSize.zw), 0.0).xyz;
    float3 p01 = SAMPLE_TEXTURE2D_X_LOD(_InputTexture, s_linear_clamp_sampler, ClampAndScaleUVForBilinear((offset + float2(0.0, 1.0) + 0.5) * _TexelSize.zw, _TexelSize.zw), 0.0).xyz;
    float3 p11 = SAMPLE_TEXTURE2D_X_LOD(_InputTexture, s_linear_clamp_sampler, ClampAndScaleUVForBilinear((offset + float2(1.0, 1.0) + 0.5) * _TexelSize.zw, _TexelSize.zw), 0.0).xyz;
#else
    uint2 uthreadUL = uint2(max(0, threadUL));
    uint2 size = uint2(_TexelSize.xy) - 1u;
    float3 p00 = _InputTexture[COORD_TEXTURE2D_X(min(uthreadUL + uint2(0u, 0u), size))].xyz;
    float3 p10 = _InputTexture[COORD_TEXTURE2D_X(min(uthreadUL + uint2(1u, 0u), size))].xyz;
    float3 p11 = _InputTexture[COORD_TEXTURE2D_X(min(uthreadUL + uint2(1u, 1u), size))].xyz;
    float3 p01 = _InputTexture[COORD_TEXTURE2D_X(min(uthreadUL + uint2(0u, 1u), size))].xyz;
#endif

    // Store the 4 downsampled pixels in LDS
    uint destIdx = groupThreadId.x + (groupThreadId.y << 4u);
    Store2Pixels(destIdx     , p00, p10);
    Store2Pixels(destIdx + 8u, p01, p11);

    GroupMemoryBarrierWithGroupSync();

    // Horizontally blur the pixels in LDS
    uint row = groupThreadId.y << 4u;
    BlurHorizontally(row + (groupThreadId.x << 1u), row + groupThreadId.x + (groupThreadId.x & 4u));

    GroupMemoryBarrierWithGroupSync();

    // Vertically blur the pixels in LDS and write the result to memory
    BlurVertically(dispatchThreadId.xy, (groupThreadId.y << 3u) + groupThreadId.x);
}