186 lines
5.5 KiB
Plaintext
186 lines
5.5 KiB
Plaintext
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/PostProcessing/Shaders/HistogramExposureCommon.hlsl"
|
|
|
|
|
|
// TODO list to investigate:
// - Worth considering multiple histograms per lane in the thread (i.e. sharedHisto[BINS][NUMB_HIST]).
// - At the moment the dispatch is at half res, but the buffer sampled is full res,
//   causing fairly bad cache behaviour. Can we use the mip chain realistically without issues?
//   [The one we have is blurred and might be incomplete?]
|
|
|
|
// Kernel entry points of this compute shader, each declared with its own pass keyword.
#pragma kernel KHistogramGen GEN_PASS
#pragma kernel KHistogramReduce REDUCE_PASS

// Optional variant: when OUTPUT_DEBUG_DATA is set, KHistogramReduce also writes to _ExposureDebugTexture.
#pragma multi_compile _ OUTPUT_DEBUG_DATA

// Thread-group dimensions used by KHistogramGen.
#define GROUP_SIZE_X 16
#define GROUP_SIZE_Y 8
|
|
|
|
// Atomics only operate on uint, but each histogram sample carries a fractional weight,
// so the weight is converted to fixed point before it is accumulated.
// Multiplying by 2048 gives somewhat ok precision while supporting up to the equivalent of a
// 1920x1080 image in one bin. (Note, we run this at half res, so equivalent of a 4k image.)
uint PackWeight(float weight)
{
    const float kFixedPointScale = 2048;
    float scaledWeight = weight * kFixedPointScale;
    return uint(scaledWeight);
}
|
|
|
|
// Per-group histogram, accumulated in groupshared memory and merged into _HistogramBuffer
// once per group at the end of KHistogramGen.
groupshared uint gs_localHistogram[HISTOGRAM_BINS];

// Histogram generation kernel.
// Each thread handles one half-resolution pixel: it samples the luminance at the matching
// full-resolution coordinate, weights it, and adds the packed weight to the group-local
// histogram. The local histogram is then added atomically to _HistogramBuffer.
//   groupIndex       - flattened index of the thread inside its group.
//   dispatchThreadId - global thread id; .xy is the half-resolution pixel coordinate.
[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
void KHistogramGen(uint groupIndex : SV_GroupIndex,
                   uint3 dispatchThreadId : SV_DispatchThreadID)
{
    const uint threadsPerGroup = GROUP_SIZE_X * GROUP_SIZE_Y;

    // Groupshared memory is not guaranteed to be 0 initialized.
    // Bins are visited with a stride of the group size, so every bin is cleared whether
    // HISTOGRAM_BINS is smaller than, equal to, or larger than the number of threads.
    // With GROUP_SIZE_X * GROUP_SIZE_Y == HISTOGRAM_BINS each thread clears exactly one bin.
    for (uint clearBin = groupIndex; clearBin < HISTOGRAM_BINS; clearBin += threadsPerGroup)
    {
        gs_localHistogram[clearBin] = 0u;
    }

    // All bins must be zeroed before any thread starts accumulating.
    GroupMemoryBarrierWithGroupSync();

    // TODO: This leads to poor cache behaviour, verify if we can use lower mip of the color pyramid.
    uint2 fullResCoords = dispatchThreadId.xy << 1u;

    // Threads that fall outside the screen contribute nothing.
    if (all(fullResCoords < uint2(_ScreenSize.xy)))
    {
        float2 uv = ClampAndScaleUVForBilinear((fullResCoords + 0.5) * _ScreenSize.zw);
        float luminance = SampleLuminance(uv);
        float weight = WeightSample(fullResCoords, _ScreenSize.xy, luminance);

        uint bin = GetHistogramBinLocation(luminance);
        InterlockedAdd(gs_localHistogram[bin], PackWeight(weight));
    }

    // Wait for every thread's contribution before publishing the local histogram.
    GroupMemoryBarrierWithGroupSync();

    // Same strided pattern as the clear above, so no bin is skipped when the bin count
    // and the group size differ.
    for (uint flushBin = groupIndex; flushBin < HISTOGRAM_BINS; flushBin += threadsPerGroup)
    {
        InterlockedAdd(_HistogramBuffer[flushBin], gs_localHistogram[flushBin]);
    }
}
|
|
|
|
// Wave intrinsics are used for the reduction only when the platform advertises them AND
// provides its lane count. The flag is resolved to a literal 0/1 here: a macro that
// expands to `defined(...)` inside an #if is not handled consistently by preprocessors.
#if defined(PLATFORM_LANE_COUNT) && defined(PLATFORM_SUPPORTS_WAVE_INTRINSICS)
#define USE_WAVE_INTRINSICS 1
#else
#define USE_WAVE_INTRINSICS 0
#endif

#if USE_WAVE_INTRINSICS

#define WAVE_SIZE PLATFORM_LANE_COUNT
// One partial sum per wave. Parenthesized so the division survives any surrounding expression.
#define SUM_SCRATCH_SIZE (HISTOGRAM_BINS / WAVE_SIZE)

#else

// One partial sum per thread.
#define SUM_SCRATCH_SIZE HISTOGRAM_BINS

#endif

// Scratch storage for the partial sums produced by ComputeTotalSum.
groupshared float gs_partialSums[SUM_SCRATCH_SIZE];
// Unpacked histogram values, one per bin; written by KHistogramReduce and read by ProcessBin.
groupshared float gs_values[HISTOGRAM_BINS];
|
|
|
|
// Adds up threadVal over the whole thread group and returns the total to every thread.
//   threadID  - index of the calling thread, used to address gs_partialSums.
//   threadVal - the value this thread contributes to the sum.
// Contains group barriers, so it must be called by every thread of the group.
float ComputeTotalSum(uint threadID, float threadVal)
{
    float total = 0;

#if USE_WAVE_INTRINSICS

    const uint numWaves = (HISTOGRAM_BINS / WAVE_SIZE);
    const uint waveSlot = threadID / WAVE_SIZE;

    // First reduce inside each wave, then let one lane per wave publish the result.
    const float perWaveSum = WaveActiveSum(threadVal);
    if (WaveIsFirstLane())
    {
        gs_partialSums[waveSlot] = perWaveSum;
    }

    // Every wave has written its partial sum once we get past this barrier.
    GroupMemoryBarrierWithGroupSync();

    // Each thread adds the per-wave results itself.
    total = gs_partialSums[0];
    [unroll]
    for (uint waveIdx = 1u; waveIdx < numWaves; ++waveIdx)
    {
        total += gs_partialSums[waveIdx];
    }

#else // !USE_WAVE_INTRINSICS

    // Seed the scratch array with one value per thread.
    gs_partialSums[threadID] = threadVal;

    GroupMemoryBarrierWithGroupSync();

    // Halve the active range each step, folding the upper half onto the lower half.
    // NOTE(review): the halving only visits every element when HISTOGRAM_BINS is a power of two.
    for (uint stride = HISTOGRAM_BINS >> 1u; stride > 0u; stride >>= 1u)
    {
        if (threadID < stride)
        {
            gs_partialSums[threadID] += gs_partialSums[threadID + stride];
        }

        GroupMemoryBarrierWithGroupSync();
    }

    total = gs_partialSums[0];

#endif

    return total;
}
|
|
|
|
// Folds one histogram bin into the running exposure average, skipping the weight that
// falls outside the percentile range tracked by extremesSums.
//   binIndex     - bin to process; its value is read from gs_values.
//   extremesSums - x: weight still to be discarded from the dark end,
//                  y: weight still available before the bright cut-off is reached.
//   evSum        - running sum of (kept weight * bin EV).
//   totalWeight  - running sum of kept weight.
void ProcessBin(uint binIndex, inout float2 extremesSums, inout float evSum, inout float totalWeight)
{
    float binWeight = gs_values[binIndex];
    float ev = BinLocationToEV(binIndex);

    // Shadows: drop as much of this bin as the remaining dark budget allows.
    // The dropped amount is taken out of both budgets.
    float discarded = min(extremesSums.x, binWeight);
    extremesSums.x -= discarded;
    extremesSums.y -= discarded;
    binWeight -= discarded;

    // Highlights: keep no more than what is left before the bright cut-off.
    binWeight = min(extremesSums.y, binWeight);
    extremesSums.y -= binWeight;

    evSum += binWeight * ev;
    totalWeight += binWeight;
}
|
|
|
|
// Histogram reduction kernel, one thread per bin.
// Every thread unpacks its bin and takes part in the group-wide sum; thread 0 then walks
// all bins, computes the average EV of the weight kept by ProcessBin, and writes the
// resulting exposure to _OutputTexture at (0, 0).
[numthreads(HISTOGRAM_BINS, 1, 1)]
void KHistogramReduce(uint3 dispatchThreadId : SV_DispatchThreadID)
{
    const uint binThread = dispatchThreadId.x;

    float binWeight = UnpackWeight(_HistogramBuffer[binThread]);
    gs_values[binThread] = binWeight;

    // ComputeTotalSum contains group barriers, so gs_values is complete once it returns.
    float histogramTotal = ComputeTotalSum(binThread, binWeight);

    // TODO: Can we be a bit more parallel here?
    // Only thread 0 does the serial pass; no barrier follows, so the others can leave.
    if (binThread != 0)
    {
        return;
    }

    float2 extremesSums = float2(_HistogramMinPercentile, _HistogramMaxPercentile) * histogramTotal;

    float weightedEVSum = 0;
    float keptWeight = 0;
    for (int bin = 0; bin < HISTOGRAM_BINS; ++bin)
    {
        ProcessBin(bin, extremesSums, weightedEVSum, keptWeight);
    }

    // Guard against a division by (almost) zero when hardly any weight was kept.
    keptWeight = max(keptWeight, 1e-4f);
    float avgEV = weightedEVSum * rcp(keptWeight);

    // Kept as local variables because they are handed to CurveRemap.
    // NOTE(review): CurveRemap may overwrite these limits - confirm against its declaration.
    float limitMin = ParamExposureLimitMin;
    float limitMax = ParamExposureLimitMax;
    if (ParamEvaluateMode == 2)
    {
        avgEV = CurveRemap(avgEV, limitMin, limitMax);
    }

    float compensatedEV = avgEV - ParamExposureCompensation;
    float exposure = clamp(AdaptExposure(compensatedEV), limitMin, limitMax);

    _OutputTexture[uint2(0, 0)] = float2(ConvertEV100ToExposure(exposure, LensImperfectionExposureScale), exposure);
#ifdef OUTPUT_DEBUG_DATA
    _ExposureDebugTexture[uint2(0, 0)] = float2(compensatedEV, 0.0f);
#endif
}
|