2021-09-09 20:42:29 -04:00

111 lines
3.6 KiB
Plaintext

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"
#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch
#pragma kernel KMain
// Source CoC texture; KMain reads only its .x channel.
TEXTURE2D_X(_InputCoCTexture);
// Destination texture (one float per pixel); VerticalPass writes the filtered max CoC into it.
RW_TEXTURE2D_X(float, _OutputCoCTexture);
CBUFFER_START(cb0)
// Only .xy is consumed by the visible code (through the Size macro below).
float4 _Params;
CBUFFER_END
// Upper bound applied to texel coordinates before the input texture is read.
// NOTE(review): KMain uses this as an inclusive clamp (min(coord, Size)), so _Params.xy
// presumably carries the last valid texel index (size - 1) rather than the size itself;
// confirm against the code that fills _Params.
#define Size uint2(_Params.xy)
// Group-shared cache for the 16x16 pixel tile loaded by one 8x8 thread group; only the
// tile's 8x8 center is written to the output. After loading, each uint holds two CoC
// values packed as 16-bit halves (16 rows x 8 uints = 128). The horizontal pass then
// reuses the same slots to hold one unpacked 32-bit float each.
groupshared uint gs_cache[128];
// Converts two values to half precision and packs them into gs_cache[index]:
// pixel1 goes in the low 16 bits, pixel2 in the high 16 bits.
void Store2Pixels(uint index, float pixel1, float pixel2)
{
    uint lowHalf  = f32tof16(pixel1);
    uint highHalf = f32tof16(pixel2) << 16;
    gs_cache[index] = lowHalf | highHalf;
}
// Unpacks the two half-precision values stored in gs_cache[index]:
// the low 16 bits are returned in pixel1, the high 16 bits in pixel2.
void Load2Pixels(uint index, out float pixel1, out float pixel2)
{
    uint bits = gs_cache[index];
    // f16tof32 only looks at the low 16 bits, so the high half is shifted down first.
    pixel2 = f16tof32(bits >> 16);
    pixel1 = f16tof32(bits);
}
// Stores a single float in gs_cache[index] as its raw 32-bit pattern.
void Store1Pixel(uint index, float pixel)
{
    uint rawBits = asuint(pixel);
    gs_cache[index] = rawBits;
}
// Reads gs_cache[index] and reinterprets its 32 bits as a float.
void Load1Pixel(uint index, out float pixel)
{
    uint rawBits = gs_cache[index];
    pixel = asfloat(rawBits);
}
// Horizontal 9-tap max over the packed values in gs_cache.
// Reads five consecutive packed slots (ten values) starting at leftMostIndex and produces
// two neighbouring results: the max of values 0..8 and the max of values 1..9. Producing
// two outputs per call lets them share the LDS reads and the unpacking work.
// The results are written back unpacked, one float per slot, at outIndex and outIndex + 1.
// NOTE(review): the loads and stores below both target gs_cache and there is no barrier
// between them inside this function; confirm the caller's indexing makes that safe.
void HorizontalPass(uint outIndex, uint leftMostIndex)
{
    float t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;

    Load2Pixels(leftMostIndex,      t0, t1);
    Load2Pixels(leftMostIndex + 1,  t2, t3);
    Load2Pixels(leftMostIndex + 2,  t4, t5);
    Load2Pixels(leftMostIndex + 3,  t6, t7);
    Load2Pixels(leftMostIndex + 4,  t8, t9);

    // Two overlapping 9-wide windows, the second shifted right by one value.
    float leftWindowMax  = Max3(Max3(t0, t1, t2), Max3(t3, t4, t5), Max3(t6, t7, t8));
    float rightWindowMax = Max3(Max3(t1, t2, t3), Max3(t4, t5, t6), Max3(t7, t8, t9));

    Store1Pixel(outIndex,     leftWindowMax);
    Store1Pixel(outIndex + 1, rightWindowMax);
}
// Vertical 9-tap max over the unpacked floats in gs_cache.
// Reads nine slots starting at topMostIndex, each 8 slots after the previous one,
// and writes their maximum to _OutputCoCTexture at pixelCoord.
void VerticalPass(uint2 pixelCoord, uint topMostIndex)
{
    float r0, r1, r2, r3, r4, r5, r6, r7, r8;

    Load1Pixel(topMostIndex,      r0);
    Load1Pixel(topMostIndex + 8,  r1);
    Load1Pixel(topMostIndex + 16, r2);
    Load1Pixel(topMostIndex + 24, r3);
    Load1Pixel(topMostIndex + 32, r4);
    Load1Pixel(topMostIndex + 40, r5);
    Load1Pixel(topMostIndex + 48, r6);
    Load1Pixel(topMostIndex + 56, r7);
    Load1Pixel(topMostIndex + 64, r8);

    float upperMax  = Max3(r0, r1, r2);
    float middleMax = Max3(r3, r4, r5);
    float lowerMax  = Max3(r6, r7, r8);

    // Final result goes straight to the output target.
    _OutputCoCTexture[COORD_TEXTURE2D_X(pixelCoord)] = Max3(upperMax, middleMax, lowerMax);
}
#define GROUP_SIZE 8
// Kernel entry point: each 8x8 thread group loads a 16x16 tile of the input CoC texture
// (offset by -4 texels so the group's 8x8 output region sits in the tile's center), then
// runs a horizontal and a vertical 9-tap max pass through gs_cache and writes one output
// texel per thread.
[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void KMain(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);

    // Top-left texel of the 2x2 quad this thread fetches; negative near the top/left edges.
    int2 quadOrigin = (groupId << 3) + (groupThreadId << 1) - 4;
    uint2 clampedOrigin = uint2(max(0, quadOrigin));

    // Clamp each texel of the quad against the upper bound.
    // NOTE(review): the clamp is inclusive of Size; confirm Size is the last valid index.
    uint2 texel00 = min(clampedOrigin + uint2(0u, 0u), Size);
    uint2 texel10 = min(clampedOrigin + uint2(1u, 0u), Size);
    uint2 texel01 = min(clampedOrigin + uint2(0u, 1u), Size);
    uint2 texel11 = min(clampedOrigin + uint2(1u, 1u), Size);

    float cocTopLeft     = _InputCoCTexture[COORD_TEXTURE2D_X(texel00)].x;
    float cocTopRight    = _InputCoCTexture[COORD_TEXTURE2D_X(texel10)].x;
    float cocBottomLeft  = _InputCoCTexture[COORD_TEXTURE2D_X(texel01)].x;
    float cocBottomRight = _InputCoCTexture[COORD_TEXTURE2D_X(texel11)].x;

    // Each thread owns two packed slots in LDS: the quad's top row, and its bottom row
    // 8 slots later.
    uint rowBase = groupThreadId.y << 4u;
    uint storeIndex = rowBase + groupThreadId.x;
    Store2Pixels(storeIndex,      cocTopLeft,    cocTopRight);
    Store2Pixels(storeIndex + 8u, cocBottomLeft, cocBottomRight);

    GroupMemoryBarrierWithGroupSync();

    // Horizontal pass: threads with x in 0..3 read from the first 8 slots of this thread
    // row's 16-slot span, threads with x in 4..7 from the second 8 (the "& 4u" term adds
    // the extra offset).
    uint horizontalOut = rowBase + (groupThreadId.x << 1u);
    uint horizontalIn  = rowBase + groupThreadId.x + (groupThreadId.x & 4u);
    HorizontalPass(horizontalOut, horizontalIn);

    GroupMemoryBarrierWithGroupSync();

    // Vertical pass: start at slot (y * 8 + x) and write this thread's own texel.
    uint verticalIn = (groupThreadId.y << 3u) + groupThreadId.x;
    VerticalPass(dispatchThreadId.xy, verticalIn);
}