2021-09-09 20:42:29 -04:00

186 lines
5.5 KiB
Plaintext

#include "Packages/com.unity.render-pipelines.high-definition/Runtime/PostProcessing/Shaders/HistogramExposureCommon.hlsl"
// TODO List to investigate
// - Worth considering multiple histograms per lane in the thread. (i.e. sharedHisto[BINS][NUMB_HIST] )
// - At the moment the dispatch is at half res, but the buffer sampled is full res,
// causing fairly bad cache behaviour. Can we use the mip chain realistically without issues? [The one we have is blurred and might be incomplete?]
// Kernel entry points of this compute shader. The extra token after each kernel name
// (GEN_PASS / REDUCE_PASS) is presumably a per-kernel define - confirm against the Unity compute shader documentation.
#pragma kernel KHistogramGen GEN_PASS
#pragma kernel KHistogramReduce REDUCE_PASS
// Thread group dimensions of the histogram generation kernel (16 x 8 threads per group).
#define GROUP_SIZE_X 16
#define GROUP_SIZE_Y 8
// Variant with OUTPUT_DEBUG_DATA defined additionally writes the debug exposure texture in the reduce kernel.
#pragma multi_compile _ OUTPUT_DEBUG_DATA
// Because atomics are only on uint and we need a weighted value, we need to convert.
// If we multiply the weight by 2048, we get somewhat ok precision and we support up to
// the equivalent of 1920x1080 image in one bin. (Note, we run this at half res, so equivalent of 4k image)
// Converts a floating point sample weight to fixed point so it can be accumulated with integer atomics.
// The weight is scaled by 2048 and the fractional part is dropped by the conversion to uint.
uint PackWeight(float weight)
{
    const float fixedPointScale = 2048.0;
    const float scaledWeight = weight * fixedPointScale;
    return uint(scaledWeight);
}
// Per-group histogram, accumulated in group shared memory before being merged into _HistogramBuffer.
groupshared uint gs_localHistogram[HISTOGRAM_BINS];

// Histogram generation kernel.
// Each thread samples the luminance of one pixel (every second pixel in x and y of the full resolution
// target), adds its packed weight to the matching bin of the group-local histogram, and finally the
// group-local histogram is added atomically to the global _HistogramBuffer.
[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
void KHistogramGen(uint groupIndex : SV_GroupIndex,
                   uint3 dispatchThreadId : SV_DispatchThreadID)
{
    // One thread per bin is responsible for clearing and later flushing that bin.
    // Currently always true (GROUP_SIZE_X * GROUP_SIZE_Y == HISTOGRAM_BINS); kept as a safeguard
    // in case the group size or the bin count changes.
    const bool ownsBin = groupIndex < HISTOGRAM_BINS;

    // Groupshared memory is not guaranteed to start zeroed, so clear it explicitly.
    if (ownsBin)
    {
        gs_localHistogram[groupIndex] = 0u;
    }

    // The whole histogram must be cleared before any thread starts accumulating.
    GroupMemoryBarrierWithGroupSync();

    // The dispatch runs at half resolution; map the thread back to a full resolution pixel.
    // TODO: This leads to poor cache behaviour, verify if we can use lower mip of the color pyramid.
    const uint2 pixelCoords = dispatchThreadId.xy * 2u;
    const uint2 screenExtent = uint2(_ScreenSize.xy);

    if (all(pixelCoords < screenExtent))
    {
        const float2 sampleUV = ClampAndScaleUVForBilinear((pixelCoords + 0.5) * _ScreenSize.zw);
        const float sampleLuminance = SampleLuminance(sampleUV);

        const uint targetBin = GetHistogramBinLocation(sampleLuminance);
        const float sampleWeight = WeightSample(pixelCoords, _ScreenSize.xy, sampleLuminance);

        InterlockedAdd(gs_localHistogram[targetBin], PackWeight(sampleWeight));
    }

    // All threads of the group must have contributed before the bins are flushed.
    GroupMemoryBarrierWithGroupSync();

    if (ownsBin)
    {
        InterlockedAdd(_HistogramBuffer[groupIndex], gs_localHistogram[groupIndex]);
    }
}
// The wave intrinsics path is taken only when the platform reports both a lane count and wave intrinsics support.
// Note that the defined() checks are part of the macro expansion, so they are evaluated where the macro is used in #if.
#define USE_WAVE_INTRINSICS defined(PLATFORM_LANE_COUNT) && defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS)
#if USE_WAVE_INTRINSICS
#define WAVE_SIZE PLATFORM_LANE_COUNT
// One scratch slot per wave: each wave stores a single partial sum.
#define SUM_SCRATCH_SIZE HISTOGRAM_BINS / WAVE_SIZE
#else
// One scratch slot per thread: the fallback reduction works on per-thread values.
#define SUM_SCRATCH_SIZE HISTOGRAM_BINS
#endif
// Scratch storage used by ComputeTotalSum for the group-wide sum.
groupshared float gs_partialSums[SUM_SCRATCH_SIZE];
// Unpacked histogram values, one per bin, written by KHistogramReduce and read by ProcessBin.
groupshared float gs_values[HISTOGRAM_BINS];
// Returns the sum of threadVal over the threads of the group, using gs_partialSums as scratch storage.
//   threadID  - index of the calling thread, used to address the scratch storage.
//   threadVal - value contributed by the calling thread.
// Contains group barriers, so every thread of the group has to call it.
// NOTE(review): both paths iterate over HISTOGRAM_BINS entries, i.e. they assume a group of
// HISTOGRAM_BINS threads; the fallback halving loop additionally looks like it expects a power of two - confirm.
float ComputeTotalSum(uint threadID, float threadVal)
{
#if USE_WAVE_INTRINSICS
    // Reduce inside each wave first, then let one lane per wave publish the wave's partial sum.
    const float perWaveTotal = WaveActiveSum(threadVal);
    if (WaveIsFirstLane())
    {
        gs_partialSums[threadID / WAVE_SIZE] = perWaveTotal;
    }

    // Every wave has to publish its partial sum before the slots are read back.
    GroupMemoryBarrierWithGroupSync();

    const uint numWaves = (HISTOGRAM_BINS / WAVE_SIZE);
    float total = gs_partialSums[0];
    [unroll]
    for (uint wave = 1u; wave < numWaves; ++wave)
    {
        total += gs_partialSums[wave];
    }
    return total;
#else // !USE_WAVE_INTRINSICS
    // Seed the scratch storage with the per-thread values.
    gs_partialSums[threadID] = threadVal;
    GroupMemoryBarrierWithGroupSync();

    // Halving reduction: on each pass the lower half accumulates the upper half.
    for (uint stride = HISTOGRAM_BINS >> 1u; stride > 0u; stride >>= 1u)
    {
        if (threadID < stride)
        {
            gs_partialSums[threadID] += gs_partialSums[threadID + stride];
        }
        GroupMemoryBarrierWithGroupSync();
    }
    return gs_partialSums[0];
#endif
}
// Accumulates the contribution of one histogram bin, excluding the weight that falls outside the
// shadow / highlight limits tracked in extremesSums.
//   binIndex     - bin to process; its weight is read from gs_values.
//   extremesSums - x: weight still to be discarded at the dark end,
//                  y: weight that may still be accepted before the bright end limit is reached.
//                  Both are updated in place.
//   evSum        - running sum of (accepted weight * bin EV).
//   totalWeight  - running sum of accepted weight.
void ProcessBin(uint binIndex, inout float2 extremesSums, inout float evSum, inout float totalWeight)
{
    float binWeight = gs_values[binIndex];

    // Shadows: discard as much of this bin as the remaining dark-end budget requires.
    // The discarded amount is removed from both budgets.
    const float discardedWeight = min(extremesSums.x, binWeight);
    extremesSums.x -= discardedWeight;
    extremesSums.y -= discardedWeight;
    binWeight -= discardedWeight;

    // Highlights: accept only what still fits below the bright-end limit.
    const float acceptedWeight = min(extremesSums.y, binWeight);
    extremesSums.y -= acceptedWeight;

    const float binEV = BinLocationToEV(binIndex);
    evSum += acceptedWeight * binEV;
    totalWeight += acceptedWeight;
}
// Histogram reduction kernel: one thread per histogram bin.
// Every thread unpacks its bin and takes part in the group-wide sum; thread 0 then walks all bins,
// computes the weighted average EV between the configured percentiles and writes the resulting exposure.
[numthreads(HISTOGRAM_BINS, 1, 1)]
void KHistogramReduce(uint3 dispatchThreadId : SV_DispatchThreadID)
{
    const uint binThread = dispatchThreadId.x;

    // Publish this thread's bin so that thread 0 can read all of them later on.
    const float binWeight = UnpackWeight(_HistogramBuffer[binThread]);
    gs_values[binThread] = binWeight;

    // Contains group barriers, so it must be called by every thread (also makes gs_values visible).
    const float histogramTotal = ComputeTotalSum(binThread, binWeight);

    // Turn the percentile limits into absolute weights.
    float2 percentileBudgets = float2(_HistogramMinPercentile, _HistogramMaxPercentile) * histogramTotal;

    // TODO: Can we be a bit more parallel here?
    if (binThread == 0)
    {
        float weightedEVSum = 0;
        float acceptedWeight = 0;
        for (uint bin = 0; bin < HISTOGRAM_BINS; ++bin)
        {
            ProcessBin(bin, percentileBudgets, weightedEVSum, acceptedWeight);
        }

        // Guard against a division by (nearly) zero when no weight was accepted.
        acceptedWeight = max(acceptedWeight, 1e-4f);
        float avgEV = weightedEVSum * rcp(acceptedWeight);

        const float exposureLowerLimit = ParamExposureLimitMin;
        const float exposureUpperLimit = ParamExposureLimitMax;

        // Evaluate mode 2 remaps the average EV through a curve; the limits are passed by
        // reference-like arguments in the original call, so they are forwarded via locals here.
        float minExposure = exposureLowerLimit;
        float maxExposure = exposureUpperLimit;
        if (ParamEvaluateMode == 2)
        {
            avgEV = CurveRemap(avgEV, minExposure, maxExposure);
        }

        const float compensatedEV = avgEV - ParamExposureCompensation;

        float exposure = AdaptExposure(compensatedEV);
        exposure = clamp(exposure, minExposure, maxExposure);

        const float linearExposure = ConvertEV100ToExposure(exposure, LensImperfectionExposureScale);
        _OutputTexture[uint2(0, 0)] = float2(linearExposure, exposure);
#ifdef OUTPUT_DEBUG_DATA
        _ExposureDebugTexture[uint2(0, 0)] = float2(compensatedEV, 0.0f);
#endif
    }
}