#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/ShaderLibrary/ShaderVariables.hlsl"

#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

#pragma kernel KMain

TEXTURE2D_X(_InputCoCTexture);

RW_TEXTURE2D_X(float, _OutputCoCTexture);

CBUFFER_START(cb0)
    float4 _Params;
CBUFFER_END

// Upper bound used when clamping input texel coordinates.
// NOTE(review): the clamp below is min(coord, Size), i.e. inclusive of Size, so _Params.xy
// presumably carries the last valid texel coordinate rather than the texture size - confirm
// against the code that sets _Params.
#define Size uint2(_Params.xy)

// Group-shared scratch memory, used in two layouts:
//  - After the initial load: a 16x16 tile of CoC values, two horizontally adjacent values packed
//    as halves into each uint (8 uints per tile row, 16 rows).
//  - After the horizontal pass: 16 rows of 8 full-precision floats (stored as raw bits), one per
//    output column.
groupshared uint gs_cache[128];

// Packs two values as 16-bit halves into one LDS slot: pixel1 in the low half, pixel2 in the
// high half.
void Store2Pixels(uint index, float pixel1, float pixel2)
{
    uint lowHalf  = f32tof16(pixel1);
    uint highHalf = f32tof16(pixel2) << 16;
    gs_cache[index] = lowHalf | highHalf;
}

// Unpacks the two 16-bit halves written by Store2Pixels from one LDS slot.
void Load2Pixels(uint index, out float pixel1, out float pixel2)
{
    uint pairBits = gs_cache[index];
    pixel1 = f16tof32(pairBits);
    pixel2 = f16tof32(pairBits >> 16);
}

// Writes the raw 32-bit pattern of a single float into one LDS slot.
void Store1Pixel(uint index, float pixel)
{
    gs_cache[index] = asuint(pixel);
}

// Reads back a single float written by Store1Pixel.
void Load1Pixel(uint index, out float pixel)
{
    pixel = asfloat(gs_cache[index]);
}

// Horizontal max over LDS, producing two neighbouring outputs per call so that each packed slot
// is read and unpacked only once. Five consecutive slots (ten values) are read starting at
// leftMostIndex; the first output is the max of values 0..8, the second the max of values 1..9.
// Results go to outIndex and outIndex + 1 as full floats.
// NOTE(review): slots written here overlap slots that neighbouring threads read in this same
// pass and there is no barrier between the loads and the stores; this presumably relies on
// those threads executing in lockstep - confirm for all target platforms.
void HorizontalPass(uint outIndex, uint leftMostIndex)
{
    float tap[10];

    // Gather all ten values before anything is written back.
    [unroll]
    for (uint i = 0u; i < 5u; ++i)
    {
        float first, second;
        Load2Pixels(leftMostIndex + i, first, second);
        tap[2u * i]      = first;
        tap[2u * i + 1u] = second;
    }

    float evenMax = Max3(Max3(tap[0], tap[1], tap[2]),
                         Max3(tap[3], tap[4], tap[5]),
                         Max3(tap[6], tap[7], tap[8]));
    float oddMax  = Max3(Max3(tap[1], tap[2], tap[3]),
                         Max3(tap[4], tap[5], tap[6]),
                         Max3(tap[7], tap[8], tap[9]));

    Store1Pixel(outIndex,     evenMax);
    Store1Pixel(outIndex + 1, oddMax);
}

// Vertical max over nine LDS values spaced 8 slots apart (one slot per row of the horizontal
// pass output), starting at topMostIndex. The result is written to the output texture at
// pixelCoord.
void VerticalPass(uint2 pixelCoord, uint topMostIndex)
{
    float tap[9];

    [unroll]
    for (uint i = 0u; i < 9u; ++i)
    {
        float value;
        Load1Pixel(topMostIndex + 8u * i, value);
        tap[i] = value;
    }

    float maxCoC = Max3(Max3(tap[0], tap[1], tap[2]),
                        Max3(tap[3], tap[4], tap[5]),
                        Max3(tap[6], tap[7], tap[8]));

    // Write to the final target
    _OutputCoCTexture[COORD_TEXTURE2D_X(pixelCoord)] = maxCoC;
}

// Reads the x channel of the input CoC texture at coord, with coord clamped from above by Size.
float LoadClampedCoC(uint2 coord)
{
    return _InputCoCTexture[COORD_TEXTURE2D_X(min(coord, Size))].x;
}

#define GROUP_SIZE 8

// Each 8x8 thread group loads a 16x16 tile of input CoC (its own 8x8 output area plus a
// 4-texel apron on the upper-left side) into LDS, then reduces it with a separable 9-tap max:
// horizontally first, then vertically. Every thread writes one output texel.
[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void KMain(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);

    // Upper-left texel of the 2x2 quad this thread fetches; negative coordinates are clamped to 0
    int2 quadOrigin = (groupThreadId << 1) + (groupId << 3) - 4;
    uint2 clampedOrigin = uint2(max(0, quadOrigin));

    float topLeft     = LoadClampedCoC(clampedOrigin + uint2(0u, 0u));
    float topRight    = LoadClampedCoC(clampedOrigin + uint2(1u, 0u));
    float bottomLeft  = LoadClampedCoC(clampedOrigin + uint2(0u, 1u));
    float bottomRight = LoadClampedCoC(clampedOrigin + uint2(1u, 1u));

    // Park the quad in LDS: its top row goes to this thread's slot, its bottom row 8 slots later
    uint quadSlot = groupThreadId.x + (groupThreadId.y << 4u);
    Store2Pixels(quadSlot,      topLeft,    topRight);
    Store2Pixels(quadSlot + 8u, bottomLeft, bottomRight);

    GroupMemoryBarrierWithGroupSync();

    // Threads with x in 0..3 reduce the first tile row of the pair owned by this y, threads with
    // x in 4..7 reduce the second one (the "x & 4" term skips ahead to it).
    uint rowPairBase   = groupThreadId.y << 4u;
    uint horizontalOut = rowPairBase + (groupThreadId.x << 1u);
    uint horizontalIn  = rowPairBase + groupThreadId.x + (groupThreadId.x & 4u);
    HorizontalPass(horizontalOut, horizontalIn);

    GroupMemoryBarrierWithGroupSync();

    VerticalPass(dispatchThreadId.xy, (groupThreadId.y << 3u) + groupThreadId.x);
}