// Kernel table. Each line names a kernel and defines the entry-point macro
// (SPATIAL_FILTER, TEMPORAL_FILTER or COPY_HISTORY) to that kernel's name, so the
// same function body below is compiled once per variant. The "Half" variants also
// define HALF_RES, which switches the depth/normal addressing and the screen bounds
// used by the filters.
#pragma kernel SpatialFilterHalf                                SPATIAL_FILTER=SpatialFilterHalf HALF_RES
#pragma kernel SpatialFilter                                    SPATIAL_FILTER=SpatialFilter
#pragma kernel TemporalFilterHalf                               TEMPORAL_FILTER=TemporalFilterHalf HALF_RES
#pragma kernel TemporalFilter                                   TEMPORAL_FILTER=TemporalFilter
#pragma kernel CopyHistory                                      COPY_HISTORY=CopyHistory

// Common includes
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/CommonLighting.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Sampling/Sampling.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Color.hlsl"

// HDRP includes
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Material/NormalBuffer.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Material/Builtin/BuiltinData.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/PostProcessing/Shaders/TemporalAntialiasing.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/Raytracing/Shaders/Denoising/BilateralFilter.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/RenderPipeline/Raytracing/Shaders/Denoising/DenoisingUtils.hlsl"

// #pragma only_renderers d3d11
// #pragma enable_d3d11_debug_symbols

// Tile size of this compute shader: every kernel in this file declares
// numthreads(SSGI_FILTER_TILE_SIZE, SSGI_FILTER_TILE_SIZE, 1).
#define SSGI_FILTER_TILE_SIZE 8

// Noisy input buffers read by every kernel in this file.
// Buffer 0 is consumed as a full float4. Buffer 1 is consumed as xyz plus a w channel:
// the spatial filter forwards w untouched, and the temporal filter reads w > 0.0 as a
// request to partially discard the history.
TEXTURE2D_X(_InputNoisyBuffer0);
TEXTURE2D_X(_InputNoisyBuffer1);

// Constant buffer that holds all the scalar values required by the kernels below.
// NOTE(review): the member order presumably has to match the CPU-side declaration - confirm before reordering.
CBUFFER_START(UnityScreenSpaceGlobalIllumination)
    // Offset added (after asuint) to a half-res pixel coordinate when loading _DepthTexture in the HALF_RES variants
    float2 _DepthPyramidFirstMipLevelOffset;
    // xy is used as the exclusive upper bound of valid half-res pixel coordinates; zw is not read in this file
    float4 _HalfScreenSize;
    // Radius, in pixels, of the square neighborhood visited by the spatial filter (also scales its gaussian sigma)
    int _IndirectDiffuseSpatialFilter;
    // When equal to 0.0 the temporal filter treats the whole history as invalid
    float _HistoryValidity;
    // Forwarded to ComputeMaxReprojectionWorldRadius to bound the accepted reprojection distance
    float _PixelSpreadAngleTangent;
CBUFFER_END

// Denoised output buffers, written by every kernel in this file.
// The temporal filter stores the accumulated sample count in the w channel of buffer 1.
RW_TEXTURE2D_X(float4, _OutputFilteredBuffer0);
RW_TEXTURE2D_X(float4, _OutputFilteredBuffer1);

// Spatial filter: gaussian sigma = _IndirectDiffuseSpatialFilter * SPATIAL_FILTER_SIGMA_RATIO
#define SPATIAL_FILTER_SIGMA_RATIO 0.9
// Spatial filter: depthWeight = max(0, 1 - |tapz01 - centerz01| * DEPTH_WEIGHT_MULTIPLIER)
#define DEPTH_WEIGHT_MULTIPLIER 100.0f
// Spatial filter: normalWeight = max(0, 1 - (1 - saturate-like(dot(n_tap, n_center))^4) * NORMAL_WEIGHT_MULTIPLIER)
#define NORMAL_WEIGHT_MULTIPLIER 5.0

// Number of samples needed to complete the accumulation loop. Below this count the
// temporal filter uses sampleCount / (sampleCount + 1) as the history weight; once the
// count is reached it switches to the fixed exponential factor N / (N + 1).
#define NUM_SAMPLE_LOOP 8.0
#define EXPONENTIAL_ACCUMULATION_FACTOR (NUM_SAMPLE_LOOP/(NUM_SAMPLE_LOOP + 1.0f))

[numthreads(SSGI_FILTER_TILE_SIZE, SSGI_FILTER_TILE_SIZE, 1)]
void SPATIAL_FILTER(uint3 dispatchThreadId : SV_DispatchThreadID , uint2 groupThreadId : SV_GroupThreadID , uint2 groupId : SV_GroupID)
{
    // Edge-aware gaussian blur of the two noisy buffers.
    // Every tap of a (2 * radius + 1)^2 window is weighted by its distance to the center
    // and by how closely its depth and normal match the center pixel.
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);

    // Pixel handled by this thread, in the resolution of the noisy buffers
    const int2 pixelCoord = dispatchThreadId.xy;

    // Depth and normal of the center pixel, plus the exclusive upper bound of valid tap coordinates.
    // In half res the depth is offset by _DepthPyramidFirstMipLevelOffset and the normal is read at twice the coordinate.
    NormalData centerNormal;
#if HALF_RES
    const float centerDepth = LOAD_TEXTURE2D_X(_DepthTexture, asuint(_DepthPyramidFirstMipLevelOffset) + pixelCoord).r;
    DecodeFromNormalBuffer(pixelCoord * 2, centerNormal);
    const float2 tapLimit = _HalfScreenSize.xy;
#else
    const float centerDepth = LOAD_TEXTURE2D_X(_DepthTexture, pixelCoord).r;
    DecodeFromNormalBuffer(pixelCoord, centerNormal);
    const float2 tapLimit = _ScreenSize.xy;
#endif

    // Background pixels (depth on the far plane) are not filtered: forward the noisy input unchanged
    if (centerDepth == UNITY_RAW_FAR_CLIP_VALUE)
    {
        _OutputFilteredBuffer0[COORD_TEXTURE2D_X(pixelCoord)] = LOAD_TEXTURE2D_X(_InputNoisyBuffer0, pixelCoord);
        _OutputFilteredBuffer1[COORD_TEXTURE2D_X(pixelCoord)] = LOAD_TEXTURE2D_X(_InputNoisyBuffer1, pixelCoord);
        return;
    }

    // The depth comparison is done on linear 0..1 depth
    const float centerLinearDepth = Linear01Depth(centerDepth, _ZBufferParams);

    // Kernel footprint
    const int radius = _IndirectDiffuseSpatialFilter;
    const float kernelSigma = radius * SPATIAL_FILTER_SIGMA_RATIO;

    // Running weighted sums
    float4 accum0 = float4(0.0, 0.0, 0.0, 0.0);
    float3 accum1 = float3(0.0, 0.0, 0.0);
    float weightTotal = 0.0;

    for (int dy = -radius; dy <= radius; ++dy)
    {
        for (int dx = -radius; dx <= radius; ++dx)
        {
            // Euclidean distance of the tap to the center (the shader is bandwidth bound, so the sqrt is affordable)
            const float tapDistance = sqrt(dx * dx + dy * dy);

            const int2 tapCoord = pixelCoord + int2(dx, dy);

            // Taps that fall outside of the screen are skipped
            if (any(tapCoord < 0) || any(tapCoord >= tapLimit))
                continue;

            // Depth and normal of the tap
            NormalData tapNormal;
#if HALF_RES
            const float tapRawDepth = LOAD_TEXTURE2D_X(_DepthTexture, asuint(_DepthPyramidFirstMipLevelOffset) + tapCoord).r;
            DecodeFromNormalBuffer(tapCoord * 2, tapNormal);
#else
            const float tapRawDepth = LOAD_TEXTURE2D_X(_DepthTexture, tapCoord).r;
            DecodeFromNormalBuffer(tapCoord, tapNormal);
#endif

            // Background taps do not contribute
            if (tapRawDepth == UNITY_RAW_FAR_CLIP_VALUE)
                continue;

            // Depth term: falls linearly to zero as the linear depths move apart
            const float tapLinearDepth = Linear01Depth(tapRawDepth, _ZBufferParams);
            const float depthWeight = max(0.0, 1.0 - abs(tapLinearDepth - centerLinearDepth) * DEPTH_WEIGHT_MULTIPLIER);

            // Normal term: fourth power of the clamped cosine between the two normals
            const float cosNormals = max(0.0, dot(tapNormal.normalWS, centerNormal.normalWS));
            const float normalCloseness = sqr(sqr(cosNormals));
            const float normalWeight = max(0.0, (1.0 - (1.0 - normalCloseness) * NORMAL_WEIGHT_MULTIPLIER));

            // The center tap always gets a unit weight; every other tap gets the combined weight
            float tapWeight = 1.0;
            if (tapDistance != 0.0)
                tapWeight = gaussian(tapDistance, kernelSigma) * depthWeight * normalWeight;

            accum0 += LOAD_TEXTURE2D_X(_InputNoisyBuffer0, tapCoord) * tapWeight;
            accum1 += LOAD_TEXTURE2D_X(_InputNoisyBuffer1, tapCoord).xyz * tapWeight;
            weightTotal += tapWeight;
        }
    }

    // Normalize and store. The w channel of the second buffer is forwarded from the input as is.
    // TODO: We could save bandwidth by doing this using a 111110 texture and storing the w in a different texture
    _OutputFilteredBuffer0[COORD_TEXTURE2D_X(pixelCoord)] = accum0 / weightTotal;
    _OutputFilteredBuffer1[COORD_TEXTURE2D_X(pixelCoord)] = float4(accum1 / weightTotal, LOAD_TEXTURE2D_X(_InputNoisyBuffer1, pixelCoord).w);
}

// History buffer inputs, read by the temporal filter at the reprojected coordinate.
// The w channel of _HistoryBuffer1 holds the number of samples accumulated so far.
TEXTURE2D_X(_HistoryBuffer0);
TEXTURE2D_X(_HistoryBuffer1);
// Depth buffer of the previous frame, this is either the full res depth or half res based on the variant of the shader.
// A far-plane value at the reprojected coordinate invalidates the history for that pixel.
TEXTURE2D_X(_HistoryDepthTexture);

[numthreads(SSGI_FILTER_TILE_SIZE, SSGI_FILTER_TILE_SIZE, 1)]
void TEMPORAL_FILTER(uint3 dispatchThreadId : SV_DispatchThreadID
                    , uint2 groupThreadId : SV_GroupThreadID
                    , uint2 groupId : SV_GroupID)
{
    // Temporal accumulation of the two noisy buffers.
    // The current pixel is reprojected into the previous frame using its motion vector. When the
    // history found there is usable it is blended with the current value, otherwise the current
    // value is output alone. The w channel of the second output holds the updated sample count.
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);

    int2 centerCoord = dispatchThreadId.xy;

    // Read the color as early as possible
    float4 color0 = LOAD_TEXTURE2D_X(_InputNoisyBuffer0, centerCoord);
    float4 color1 = LOAD_TEXTURE2D_X(_InputNoisyBuffer1, centerCoord);

    // fullResCoord addresses the inputs that are stored at full resolution (position reconstruction,
    // motion vectors and normal buffer). In half res it is twice the filter coordinate, matching the
    // way the spatial kernel addresses the normal buffer.
#if HALF_RES
    uint2 fullResCoord = centerCoord * 2;
    float depth = LOAD_TEXTURE2D_X(_DepthTexture, asuint(_DepthPyramidFirstMipLevelOffset) + centerCoord).r;
#else
    uint2 fullResCoord = centerCoord;
    float depth = LOAD_TEXTURE2D_X(_DepthTexture, centerCoord).r;
#endif

    // Get the posinputs of the current version of the pixel
    PositionInputs posInputs = GetPositionInput(fullResCoord, _ScreenSize.zw, depth, UNITY_MATRIX_I_VP, GetWorldToViewMatrix());

    // Initialize the output buffer in case of an early exit: current color only, sample count of 1.
    _OutputFilteredBuffer0[COORD_TEXTURE2D_X(centerCoord)] = color0;
    _OutputFilteredBuffer1[COORD_TEXTURE2D_X(centerCoord)] = float4(color1.xyz, 1.0);

    // Background pixels have nothing to accumulate
    if (depth == UNITY_RAW_FAR_CLIP_VALUE)
        return;

    // Decode the velocity of the pixel
    float2 velocity = float2(0.0, 0.0);
    DecodeMotionVector(LOAD_TEXTURE2D_X(_CameraMotionVectorsTexture, (float2)fullResCoord), velocity);

    // Compute the pixel coordinate for the history tapping
    int2 historyTapCoord = (int2)((posInputs.positionNDC - velocity) * _ScreenSize.xy);
#if HALF_RES
    // The history buffers are half res in this variant
    historyTapCoord *= 0.5;
    // If the pixel was outside of the screen during the previous frame, invalidate the history
    if (historyTapCoord.x >= _HalfScreenSize.x || historyTapCoord.x < 0
        || historyTapCoord.y >= _HalfScreenSize.y || historyTapCoord.y < 0)
        return;
#else
    // If the pixel was outside of the screen during the previous frame, invalidate the history
    if (historyTapCoord.x >= _ScreenSize.x || historyTapCoord.x < 0
        || historyTapCoord.y >= _ScreenSize.y || historyTapCoord.y < 0)
        return;
#endif

    // Fetch the depth of the history pixel
    float historyDepth = LOAD_TEXTURE2D_X(_HistoryDepthTexture, historyTapCoord).r;

    // The history is unusable if it was a background pixel or if the whole history has been flagged as invalid
    bool invalidHistory = false;
    if (historyDepth == UNITY_RAW_FAR_CLIP_VALUE || _HistoryValidity == 0.0)
        invalidHistory = true;

    // Compute the world space position (from previous frame)
    float3 historyPositionWS = ComputeWorldSpacePosition(posInputs.positionNDC - velocity, historyDepth, UNITY_MATRIX_PREV_I_VP);

    // Read the normal data for this pixel. The normal buffer is addressed with the full res coordinate,
    // otherwise the half res variant would fetch the normal of an unrelated pixel.
    NormalData normalData;
    DecodeFromNormalBuffer(fullResCoord, normalData);

    // Compute the max reprojection distance. This is evaluated as the max between a fixed radius value and an approximation of the footprint of the pixel.
    float maxRadius = ComputeMaxReprojectionWorldRadius(posInputs.positionWS, normalData.normalWS, _PixelSpreadAngleTangent);

    // Is it too far from the current position?
    if (length(historyPositionWS - posInputs.positionWS) > maxRadius)
        invalidHistory = true;

    // Fetch history data, the w channel of the second buffer is the accumulated sample count
    float4 history0 = LOAD_TEXTURE2D_X(_HistoryBuffer0, historyTapCoord);
    float4 history1 = LOAD_TEXTURE2D_X(_HistoryBuffer1, historyTapCoord);
    float sampleCount = history1.w;

    // Accumulation factor that tells us how much we need to keep the history data
    float accumulationFactor = 0.0;

    // If the history is invalid
    if (invalidHistory || sampleCount == 0.0)
    {
        // We only take the current value
        accumulationFactor = 0.0;

        // Replace the history with finite values so that a NaN in it cannot leak through the blend below
        history0 = 0.0;
        history1 = float4(0.501960784f, 0.501960784f, 0.0, 0.0);
        sampleCount = 1.0;
    }
    else
    {
        // Running average until NUM_SAMPLE_LOOP samples have been gathered, fixed exponential factor afterwards
        accumulationFactor = sampleCount >= NUM_SAMPLE_LOOP ? EXPONENTIAL_ACCUMULATION_FACTOR : (sampleCount / (sampleCount + 1.0));

        // Update the sample count
        sampleCount = min(sampleCount + 1.0, NUM_SAMPLE_LOOP);

        // If the history pixel was moving, we decide to partially throw away the history
        // TODO: Expose this as a parameter
        if (color1.w > 0.0)
        {
            sampleCount = 5.0;
            accumulationFactor = sampleCount / (sampleCount + 1.0);
        }
    }

    // Do the accumulation based on the values we computed and store the new sample count in the w channel
    _OutputFilteredBuffer0[COORD_TEXTURE2D_X(centerCoord)] = float4(color0 * (1.0 - accumulationFactor) + history0 * accumulationFactor);
    _OutputFilteredBuffer1[COORD_TEXTURE2D_X(centerCoord)] = float4(color1.xyz * (1.0 - accumulationFactor) + history1.xyz * accumulationFactor, sampleCount);
}

[numthreads(SSGI_FILTER_TILE_SIZE, SSGI_FILTER_TILE_SIZE, 1)]
void COPY_HISTORY(uint3 dispatchThreadId : SV_DispatchThreadID)
{
    // Straight copy of both input buffers into the output buffers, one pixel per thread.
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);

    // Valid pixel coordinates are [0, _ScreenSize.xy - 1], so a thread whose coordinate equals
    // the screen size is already out of bounds (hence >= rather than >).
    if (any(dispatchThreadId.xy >= uint2(_ScreenSize.xy)))
        return;  // Out of bounds, discard

    _OutputFilteredBuffer0[COORD_TEXTURE2D_X(dispatchThreadId.xy)] = LOAD_TEXTURE2D_X(_InputNoisyBuffer0, dispatchThreadId.xy);
    _OutputFilteredBuffer1[COORD_TEXTURE2D_X(dispatchThreadId.xy)] = LOAD_TEXTURE2D_X(_InputNoisyBuffer1, dispatchThreadId.xy);
}