// Compute kernel that converts per-entry element counts into indirect draw /
// indirect dispatch argument blocks.
#pragma kernel CSVFXIndirectArgs

#include "HLSLSupport.cginc"

CBUFFER_START(Uniform)
    uint maxNb;  // Exclusive upper bound on the entry index processed by the kernel.
    uint offset; // Added to the dispatch thread id to obtain the entry index.
CBUFFER_END

#define NB_THREADS_PER_GROUP 64

// Per-entry description read from inputCpuBuffer.
struct InputData //TODO use generic name (indexCount isn't always what it expects to be)
{
    uint type;       // Selects the branch in CSVFXIndirectArgs (0..6, or 0xffffffff for dispatch).
    uint indexCount; // Meshes: index count per instance. Dispatch: entry index whose running total is read from outputAdditional.
    uint startIndex; // Meshes: written to the startIndex argument.
    uint baseVertex; // Meshes: written to the baseVertex argument.
};

ByteAddressBuffer inputGpuBuffer;           // One uint element count per entry (byte address = index * 4).
StructuredBuffer<InputData> inputCpuBuffer; // One InputData per entry.
RWBuffer<uint> outputIndirectArgs;          // 10 uints per entry: two consecutive blocks of 5 arguments.
RWByteAddressBuffer outputAdditional;       // 2 uints per entry, only written for dispatch entries.

// Fills args.x (index count per instance) and args.y (instance count) for a
// primitive type drawn with 16-bit indices.
//   elementCount        : number of elements to draw.
//   maxElementsPerDraw  : above this count the draw is split into several instances.
//   particlesPerInstance: number of elements covered by each instance when split.
//   indicesPerParticle  : number of indices emitted per element.
void SetBatchedDrawArgs(inout uint4 args, uint elementCount, uint maxElementsPerDraw, uint particlesPerInstance, uint indicesPerParticle)
{
    if (elementCount > maxElementsPerDraw)
    {
        args.x = particlesPerInstance * indicesPerParticle;
        // Round up so that a partially filled last instance is still drawn.
        args.y = (elementCount + (particlesPerInstance - 1)) / particlesPerInstance;
    }
    else
    {
        args.x = elementCount * indicesPerParticle;
        args.y = 1;
    }
}

// One thread per entry: reads the element count and the entry description, then
// writes the matching argument blocks to outputIndirectArgs. Types that match no
// case leave all four arguments at zero.
[numthreads(NB_THREADS_PER_GROUP, 1, 1)]
void CSVFXIndirectArgs(uint3 id : SV_DispatchThreadID)
{
    uint index = id.x + offset;
    if (index < maxNb)
    {
        uint elementCount = inputGpuBuffer.Load(index << 2);
        InputData data = inputCpuBuffer[index];
        uint4 args = (uint4)0;

        switch (data.type)
        {
            case 0: // point
            {
                args.x = elementCount;
                args.y = 1;
                break;
            }
            case 1: // lines
            {
                args.x = elementCount * 2;
                args.y = 1;
                break;
            }
            case 2: // quads
            {
                // Due to 16bits indices (4 vertices per element)
                SetBatchedDrawArgs(args, elementCount, 16384, 2048, 6);
                break;
            }
            case 3: // hexahedron
            {
                // Due to 16bits indices (8 vertices per element)
                SetBatchedDrawArgs(args, elementCount, 8192, 1024, 36);
                break;
            }
            case 4: // meshes
            {
                args.x = data.indexCount;
                args.y = elementCount;
                args.z = data.startIndex;
                args.w = data.baseVertex;
                break;
            }
            case 5: // triangles
            {
                args.x = elementCount * 3;
                args.y = 1;
                break;
            }
            case 6: // octagon
            {
                // Due to 16bits indices (8 vertices per element)
                SetBatchedDrawArgs(args, elementCount, 8192, 1024, 18);
                break;
            }
            case 0xffffffff: // dispatch
            {
                // Clamp indirect dispatch to the maximum dispatch width of DX11 : 65535 (D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION)
                elementCount = min(elementCount, 65535u * NB_THREADS_PER_GROUP);
                args.x = (elementCount + NB_THREADS_PER_GROUP - 1) / NB_THREADS_PER_GROUP;

                // Update total element count (spawnIndex):
                // slot 0 receives the clamped count, slot 1 receives the total read from
                // the entry designated by data.indexCount plus the clamped count.
                outputAdditional.Store((index * 2 + 0) << 2, elementCount);
                outputAdditional.Store((index * 2 + 1) << 2, outputAdditional.Load((data.indexCount * 2 + 1) << 2) + elementCount);

                args.y = args.z = 1u;
                break;
            }
        }

        uint indexOutput = index * 10;
        outputIndirectArgs[indexOutput + 0] = args.x; // IndexCountPerInstance or ThreadGroupCountX
        outputIndirectArgs[indexOutput + 1] = args.y; // InstanceCount or ThreadGroupCountY
        outputIndirectArgs[indexOutput + 2] = args.z; // startIndex or ThreadGroupCountZ
        outputIndirectArgs[indexOutput + 3] = args.w; // baseVertex
        // next uint (StartInstance) set to 0 at initialization

        // XR single-pass instancing support (same data as above but instanceCount is multiplied by the number of instanced views)
        // Dispatch entries are not draws, so their Y component is left unscaled.
        uint instanceMultiplier = (data.type != 0xffffffff) ? 2 : 1;
        outputIndirectArgs[indexOutput + 5] = args.x;
        outputIndirectArgs[indexOutput + 6] = args.y * instanceMultiplier;
        outputIndirectArgs[indexOutput + 7] = args.z;
        outputIndirectArgs[indexOutput + 8] = args.w;
        // next uint (StartInstance) set to 0 at initialization
    }
}