#include "Packages/com.unity.render-pipelines.high-definition/Runtime/PostProcessing/Shaders/HistogramExposureCommon.hlsl"


// TODO List to investigate
//      - Worth considering multiple histograms per lane in the thread. (i.e. sharedHisto[BINS][NUMB_HIST] )
//      - At the moment the dispatch is at half res, but the buffer sampled is full res,
//        causing fairly bad cache behaviour. Can we use the mip chain realistically without issues? [The one we have is blurred and might be incomplete?]

// Kernel entry points of this compute shader.
// NOTE(review): the trailing token on each line (GEN_PASS / REDUCE_PASS) is presumably a per-kernel
// define; neither is referenced in the visible part of this file - confirm where they are consumed.
#pragma kernel KHistogramGen        GEN_PASS
#pragma kernel KHistogramReduce     REDUCE_PASS
// Thread group dimensions used by the numthreads attribute of KHistogramGen.
#define GROUP_SIZE_X 16
#define GROUP_SIZE_Y 8

// Variant keyword: when OUTPUT_DEBUG_DATA is defined, KHistogramReduce also writes _ExposureDebugTexture.
#pragma multi_compile _ OUTPUT_DEBUG_DATA

// Atomic adds only operate on uint, while the histogram accumulates weighted (fractional) samples,
// so the weight is converted to fixed point before being added to a bin.
// A scale of 2048 gives somewhat ok precision and still lets a single bin hold the equivalent of a
// 1920x1080 image. (Note, this pass runs at half res, so that is the equivalent of a 4k image.)
// The fractional part left after scaling is truncated by the conversion to uint.
uint PackWeight(float weight)
{
    const float fixedPointScale = 2048;
    return (uint)(weight * fixedPointScale);
}

// Per-group histogram. Samples are accumulated here first and merged into _HistogramBuffer once per
// group, so that only one global atomic per bin and per group is issued.
groupshared uint gs_localHistogram[HISTOGRAM_BINS];


// Histogram generation kernel.
// Each thread handles one pixel of a half resolution grid: it samples the luminance at the matching
// full resolution coordinate, weights it and adds the packed weight to the bin of that luminance.
//   groupIndex       - flattened index of the thread inside its group.
//   dispatchThreadId - global thread id; .xy is the half resolution pixel coordinate.
[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
void KHistogramGen(uint groupIndex : SV_GroupIndex,
                   uint3 dispatchThreadId : SV_DispatchThreadID)
{
    const uint threadsPerGroup = GROUP_SIZE_X * GROUP_SIZE_Y;

    // Groupshared memory is not guaranteed to be 0 initialized.
    // The strided loop covers every bin whatever the relation between the group size and
    // HISTOGRAM_BINS is. With the current setup (GROUP_SIZE_X * GROUP_SIZE_Y == HISTOGRAM_BINS)
    // each thread clears exactly one bin.
    for (uint clearBin = groupIndex; clearBin < HISTOGRAM_BINS; clearBin += threadsPerGroup)
    {
        gs_localHistogram[clearBin] = 0u;
    }

    // Every bin must be cleared before any thread starts accumulating into it.
    GroupMemoryBarrierWithGroupSync();

    // TODO: This leads to poor cache behaviour, verify if we can use lower mip of the color pyramid.
    uint2 fullResCoords = dispatchThreadId.xy << 1u;

    // Threads that fall outside of the screen contribute nothing, but they still have to reach the
    // barriers and take part in the clear / flush loops.
    if (all(fullResCoords < uint2(_ScreenSize.xy)))
    {
        float2 uv = ClampAndScaleUVForBilinear((fullResCoords + 0.5) * _ScreenSize.zw);
        float luminance = SampleLuminance(uv);
        float weight = WeightSample(fullResCoords, _ScreenSize.xy, luminance);

        uint  bin = GetHistogramBinLocation(luminance);
        InterlockedAdd(gs_localHistogram[bin], PackWeight(weight));
    }

    // The local histogram must be complete before it is merged into the global one.
    GroupMemoryBarrierWithGroupSync();

    // Same strided pattern as the clear above, so no bin is skipped if the group size or the bin
    // count changes.
    for (uint flushBin = groupIndex; flushBin < HISTOGRAM_BINS; flushBin += threadsPerGroup)
    {
        InterlockedAdd(_HistogramBuffer[flushBin], gs_localHistogram[flushBin]);
    }
}

// The wave intrinsics path of the reduction is used only when the platform exposes both its lane
// count and wave intrinsics support.
// The condition is resolved to 0 / 1 right here: a macro that expands to `defined(...)` inside an
// #if relies on behaviour that is not portable across preprocessors.
#if defined(PLATFORM_LANE_COUNT) && defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS)
#define USE_WAVE_INTRINSICS     1
#else
#define USE_WAVE_INTRINSICS     0
#endif


#if USE_WAVE_INTRINSICS

#define WAVE_SIZE   PLATFORM_LANE_COUNT
// One scratch slot per wave. Parenthesized so that the macro stays correct inside larger expressions.
#define SUM_SCRATCH_SIZE  (HISTOGRAM_BINS / WAVE_SIZE)

#else

// One scratch slot per thread for the groupshared tree reduction.
#define SUM_SCRATCH_SIZE  (HISTOGRAM_BINS)

#endif

// Scratch storage used by ComputeTotalSum.
groupshared float gs_partialSums[SUM_SCRATCH_SIZE];
// Unpacked histogram, one value per bin, read back by ProcessBin.
groupshared float gs_values[HISTOGRAM_BINS];

// Returns the sum of threadVal over the whole thread group; every thread gets the same result.
//   threadID  - index of the calling thread, used to address gs_partialSums.
//   threadVal - value contributed by the calling thread.
// Both code paths go through at least one group-wide barrier, so the function has to be called by
// all the threads of the group.
float ComputeTotalSum(uint threadID, float threadVal)
{
#if USE_WAVE_INTRINSICS

    // First stage: reduce inside each wave, then let a single lane per wave publish the result.
    // NOTE(review): threadID / WAVE_SIZE assumes that consecutive thread IDs are packed into the
    // same wave - confirm for the target platforms.
    const float waveTotal = WaveActiveSum(threadVal);
    if (WaveIsFirstLane())
    {
        gs_partialSums[threadID / WAVE_SIZE] = waveTotal;
    }

    // All the per-wave totals must be written before anybody reads them.
    GroupMemoryBarrierWithGroupSync();

    // Second stage: every thread adds up the handful of per-wave totals.
    const uint numWaves = (HISTOGRAM_BINS / WAVE_SIZE);
    float total = gs_partialSums[0];
    [unroll]
    for (uint waveIdx = 1u; waveIdx < numWaves; ++waveIdx)
    {
        total += gs_partialSums[waveIdx];
    }

    return total;

#else // !USE_WAVE_INTRINSICS

    // Seed the scratch buffer with one value per thread.
    gs_partialSums[threadID] = threadVal;

    GroupMemoryBarrierWithGroupSync();

    // Tree reduction: at each step the lower half of the active range absorbs the upper half,
    // until the total ends up in slot 0.
    for (uint stride = HISTOGRAM_BINS >> 1u; stride > 0u; stride >>= 1u)
    {
        if (threadID < stride)
        {
            gs_partialSums[threadID] += gs_partialSums[threadID + stride];
        }

        // Finish this step on all threads before starting the next one.
        GroupMemoryBarrierWithGroupSync();
    }

    return gs_partialSums[0];

#endif
}

// Folds one histogram bin into the running average while applying the percentile cut-offs.
//   binIndex     - bin to process; its weight is read from gs_values.
//   extremesSums - remaining weight budgets, updated in place:
//                    x = weight that still has to be skipped on the shadows side,
//                    y = weight that can still be consumed before the highlights cut-off.
//   evSum        - running sum of (accepted weight * EV of the bin).
//   totalWeight  - running sum of the accepted weight.
// Bins are expected to be visited in increasing index order, since both budgets are consumed
// cumulatively from one call to the next.
void ProcessBin(uint binIndex, inout float2 extremesSums, inout float evSum, inout float totalWeight)
{
    float remaining = gs_values[binIndex];

    // Shadows: drop as much of this bin as the low percentile budget still asks for. The dropped
    // weight is charged to the high budget as well.
    const float skipped = min(extremesSums.x, remaining);
    extremesSums.x -= skipped;
    extremesSums.y -= skipped;
    remaining -= skipped;

    // Highlights: keep only what still fits under the high percentile budget.
    const float accepted = min(extremesSums.y, remaining);
    extremesSums.y -= accepted;

    evSum += accepted * BinLocationToEV(binIndex);
    totalWeight += accepted;
}

// Histogram reduction kernel: one thread per bin.
// All the threads cooperate to compute the total weight of the histogram, then thread 0 walks the
// bins to get the percentile-filtered average EV and writes the resulting exposure.
// NOTE(review): dispatchThreadId.x is used directly as the bin index, which presumably relies on
// the kernel being dispatched with a single thread group - confirm on the caller side.
[numthreads(HISTOGRAM_BINS, 1, 1)]
void KHistogramReduce(uint3 dispatchThreadId : SV_DispatchThreadID)
{
    const uint binIndex = dispatchThreadId.x;

    // Publish the unpacked bin so that it can be read back through gs_values by ProcessBin.
    const float binWeight = UnpackWeight(_HistogramBuffer[binIndex]);
    gs_values[binIndex] = binWeight;

    // ComputeTotalSum contains a group-wide barrier, hence gs_values is complete once it returns.
    const float histogramTotal = ComputeTotalSum(binIndex, binWeight);

    // TODO: Can we be a bit more parallel here?
    // From here on only the first thread has work to do. There is no barrier past this point.
    if (binIndex != 0)
        return;

    // Turn the percentiles into absolute weight budgets.
    float2 percentileBudgets = float2(_HistogramMinPercentile, _HistogramMaxPercentile) * histogramTotal;

    float weightedEVSum = 0;
    float acceptedWeight = 0;
    for (uint bin = 0u; bin < HISTOGRAM_BINS; ++bin)
    {
        ProcessBin(bin, percentileBudgets, weightedEVSum, acceptedWeight);
    }

    // Keep the divisor away from zero in case no weight was accepted.
    float avgEV = weightedEVSum * rcp(max(acceptedWeight, 1e-4f));

    float evLowerLimit = ParamExposureLimitMin;
    float evUpperLimit = ParamExposureLimitMax;
    if (ParamEvaluateMode == 2)
    {
        // NOTE(review): CurveRemap presumably may also update both limits - confirm against its
        // declaration. The limits are read again by the clamp below.
        avgEV = CurveRemap(avgEV, evLowerLimit, evUpperLimit);
    }

    const float compensatedEV = avgEV - ParamExposureCompensation;
    const float exposure = clamp(AdaptExposure(compensatedEV), evLowerLimit, evUpperLimit);

    _OutputTexture[uint2(0, 0)] = float2(ConvertEV100ToExposure(exposure, LensImperfectionExposureScale), exposure);
#ifdef OUTPUT_DEBUG_DATA
    _ExposureDebugTexture[uint2(0, 0)] = float2(compensatedEV, 0.0f);
#endif
}