From ee2c18cc998b27249bc297a22823c28ced64e230 Mon Sep 17 00:00:00 2001 From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com> Date: Sat, 20 Jun 2026 03:54:31 +0900 Subject: [PATCH 1/7] gpu vulkan processing for lighting --- include/light/light.hh | 1 + include/light/trace_embree.hh | 47 + include/light/trace_gpu.hh | 140 +++ light/CMakeLists.txt | 34 +- light/gpu_shaders/direct_phase.comp | 180 ++++ light/gpu_shaders/occlusion.comp | 59 ++ light/light.cc | 22 +- light/ltface.cc | 406 ++++++++- light/trace_gpu.cc | 233 +++++ light/trace_gpu_vulkan.cc | 1244 +++++++++++++++++++++++++++ 10 files changed, 2362 insertions(+), 4 deletions(-) create mode 100644 include/light/trace_gpu.hh create mode 100644 light/gpu_shaders/direct_phase.comp create mode 100644 light/gpu_shaders/occlusion.comp create mode 100644 light/trace_gpu.cc create mode 100644 light/trace_gpu_vulkan.cc diff --git a/include/light/light.hh b/include/light/light.hh index 0e3cb7f5..4b2852a6 100644 --- a/include/light/light.hh +++ b/include/light/light.hh @@ -396,6 +396,7 @@ public: setting_bool novanilla; setting_scalar gate; setting_int32 sunsamples; + settings::setting_bool gpu; // -gpu: use Vulkan GPU ray-query backend when available setting_bool arghradcompat; setting_bool nolighting; setting_vec3 debugface; diff --git a/include/light/trace_embree.hh b/include/light/trace_embree.hh index 3f4856c5..997bc404 100644 --- a/include/light/trace_embree.hh +++ b/include/light/trace_embree.hh @@ -18,6 +18,7 @@ */ #pragma once +#include #include #include @@ -280,6 +281,52 @@ public: if (!_rays.size()) return; +#if defined(HAVE_GPU_LIGHT) + // Optional large-batch occlusion path. v5 direct lighting uses + // direct_phase.comp; small fallback raystreams stay on Embree. 
+ constexpr size_t GPU_OCCLUSION_MIN_BATCH = 262144; + + if (_rays.size() >= GPU_OCCLUSION_MIN_BATCH && GPU_TraceAvailable()) { + std::vector gpu_rays; + std::vector gpu_results; + + gpu_rays.resize(_rays.size()); + gpu_results.resize(_rays.size()); + + for (size_t i = 0; i < _rays.size(); ++i) { + const auto &src = _rays[i].ray.ray; + auto &dst = gpu_rays[i]; + + dst.origin[0] = src.org_x; + dst.origin[1] = src.org_y; + dst.origin[2] = src.org_z; + dst.tmin = src.tnear; + + dst.direction[0] = src.dir_x; + dst.direction[1] = src.dir_y; + dst.direction[2] = src.dir_z; + dst.tmax = src.tfar; + + dst.shadow_mask = static_cast(shadowmask); + dst.user_index = static_cast(i); + } + + if (gpu_light::trace_occlusion_batch( + self, + static_cast(shadowmask), + gpu_rays.data(), + gpu_results.data(), + gpu_rays.size())) { + for (size_t i = 0; i < _rays.size(); ++i) { + if (gpu_results[i].occluded) { + _rays[i].ray.ray.tfar = -std::abs(_rays[i].ray.ray.tfar); + } + } + return; + } + } +#endif + ray_source_info ctx2(this, self, shadowmask); RTCOccludedArguments embree4_args = ctx2.setup_occluded_arguments(); for (auto &ray : _rays) diff --git a/include/light/trace_gpu.hh b/include/light/trace_gpu.hh new file mode 100644 index 00000000..a0ecfb51 --- /dev/null +++ b/include/light/trace_gpu.hh @@ -0,0 +1,140 @@ +/* GPU trace backend + * Prototype overlay generated for Linux/Vulkan ray-query development. 
+ */ +#pragma once + +#include +#include +#include + +struct mbsp_t; +class modelinfo_t; + +#ifndef HAVE_GPU_LIGHT +#define GPU_LIGHT_COMPILED 0 +#else +#define GPU_LIGHT_COMPILED 1 +#endif + +namespace gpu_light { + +struct ray_t { + float origin[3] = {0, 0, 0}; + float tmin = 0.01f; + float direction[3] = {0, 0, 1}; + float tmax = 0.0f; + std::uint32_t shadow_mask = 0xffffffffu; + std::uint32_t user_index = 0; +}; + +struct occlusion_result_t { + std::uint32_t occluded = 0; + std::uint32_t reserved0 = 0; + float transmittance[3] = {1.0f, 1.0f, 1.0f}; +}; + +struct direct_job_t { + float ox = 0, oy = 0, oz = 0, tmin = 0.01f; + float dx = 0, dy = 0, dz = 1, tmax = 0.0f; + float cr = 0, cg = 0, cb = 0, pad0 = 0; + float nr = 0, ng = 0, nb = 0, pad1 = 0; + std::uint32_t sample_index = 0; + std::uint32_t flags = 0; + std::uint32_t reserved0 = 0; + std::uint32_t reserved1 = 0; +}; + +struct direct_sample_range_t { + std::uint32_t first = 0; + std::uint32_t count = 0; +}; + +struct direct_accum_t { + float cr = 0, cg = 0, cb = 0, pad0 = 0; + float nr = 0, ng = 0, nb = 0, pad1 = 0; + std::uint32_t hit = 0; + std::uint32_t reserved0 = 0; + std::uint32_t reserved1 = 0; + std::uint32_t reserved2 = 0; +}; + +struct direct_phase_sample_t { + float px = 0, py = 0, pz = 0, occlusion = 1; + float nx = 0, ny = 0, nz = 1, twosided = 0; +}; + +struct direct_phase_source_t { + float px = 0, py = 0, pz = 0, light = 0; + float dx = 0, dy = 0, dz = 1, dist = 65536.0f; + float cr = 1, cg = 1, cb = 1, atten = 1; + std::uint32_t type = 0; // 0 = point, 1 = sun + std::uint32_t formula = 0; // light_formula_t for point lights + std::uint32_t flags = 0; // bit 0: dirt + std::uint32_t reserved0 = 0; + float anglescale = 1; + float dirt = 0; + float falloff = 0; + float pad0 = 0; +}; + +using direct_phase_accum_t = direct_accum_t; + + +enum class backend_state_t { + unavailable, + initialized, + failed +}; + +struct stats_t { + std::uint64_t batches = 0; + std::uint64_t rays = 0; + 
std::uint64_t gpu_batches = 0; + std::uint64_t fallback_batches = 0; +}; + +bool requested(); +backend_state_t state(); +const char *state_string(); +const char *last_error(); +stats_t stats(); + +bool init(const mbsp_t *bsp); +void shutdown(); + +// Returns true when the batch was handled by the GPU backend. Returns false to +// tell the caller to run the existing CPU/Embree path. +bool trace_occlusion_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const ray_t *rays, + occlusion_result_t *results, + std::size_t count); + + +bool trace_direct_phase_batch( + const direct_phase_source_t *sources, + std::size_t source_count, + const direct_phase_sample_t *samples, + direct_phase_accum_t *accum, + std::size_t sample_count); + +bool trace_direct_accumulate_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const direct_job_t *jobs, + std::size_t job_count, + const direct_sample_range_t *ranges, + direct_accum_t *accum, + std::size_t sample_count); + +} // namespace gpu_light + +// C-style wrappers are easier to call from older code paths. +bool GPU_TraceInit(const mbsp_t *bsp); +void GPU_TraceShutdown(); +bool GPU_TraceAvailable(); +const char *GPU_TraceLastError(); + +// Implemented in light/ltface.cc by the v5 overlay; flushes pending sample-driven direct-light work. 
+void GPU_DirectQueue_Flush(const mbsp_t *bsp); diff --git a/light/CMakeLists.txt b/light/CMakeLists.txt index 030bde65..73ffcda6 100644 --- a/light/CMakeLists.txt +++ b/light/CMakeLists.txt @@ -1,3 +1,4 @@ +option(LIGHT_ENABLE_VULKAN_GPU "Enable Vulkan GPU ray-query backend for light" OFF) option(SKIP_TBB_INSTALL "Skip TBB Library Installation" OFF) option(SKIP_EMBREE_INSTALL "Skip Embree Library Installation" OFF) @@ -9,7 +10,7 @@ set(LIGHT_INCLUDES ../include/light/bounce.hh ../include/light/surflight.hh ../include/light/ltface.hh - ../include/light/trace.hh + ../include/light/trace.hh ../include/light/trace_gpu.hh ../include/light/write.hh ../include/light/spatialindex.hh ) @@ -47,9 +48,40 @@ endif(embree_FOUND) add_library(liblight STATIC ${LIGHT_SOURCES}) +if (LIGHT_ENABLE_VULKAN_GPU) + find_package(Vulkan REQUIRED) + find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED) + + target_sources(liblight PRIVATE + trace_gpu.cc + trace_gpu_vulkan.cc + ) + target_compile_definitions(liblight PRIVATE HAVE_GPU_LIGHT=1) + target_link_libraries(liblight PRIVATE Vulkan::Vulkan) + + set(GPU_SHADER_SPVS) + foreach(GPU_SHADER_NAME occlusion direct_phase) + set(GPU_SHADER_SRC "${CMAKE_CURRENT_SOURCE_DIR}/gpu_shaders/${GPU_SHADER_NAME}.comp") + set(GPU_SHADER_SPV "${CMAKE_CURRENT_BINARY_DIR}/gpu_shaders/${GPU_SHADER_NAME}.comp.spv") + add_custom_command( + OUTPUT "${GPU_SHADER_SPV}" + COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/gpu_shaders" + COMMAND "${GLSLANG_VALIDATOR}" -V "${GPU_SHADER_SRC}" -o "${GPU_SHADER_SPV}" + DEPENDS "${GPU_SHADER_SRC}" + VERBATIM) + list(APPEND GPU_SHADER_SPVS "${GPU_SHADER_SPV}") + endforeach() + add_custom_target(light_gpu_shaders DEPENDS ${GPU_SHADER_SPVS}) + add_dependencies(liblight light_gpu_shaders) +endif() + target_link_libraries(liblight PRIVATE common ${CMAKE_THREAD_LIBS_INIT} fmt::fmt jsoncpp_static) add_executable(light main.cc) +if (LIGHT_ENABLE_VULKAN_GPU) + add_dependencies(light light_gpu_shaders) 
+endif() + target_link_libraries(light PRIVATE common liblight) if (embree_FOUND) diff --git a/light/gpu_shaders/direct_phase.comp b/light/gpu_shaders/direct_phase.comp new file mode 100644 index 00000000..663ba73f --- /dev/null +++ b/light/gpu_shaders/direct_phase.comp @@ -0,0 +1,180 @@ +#version 460 +#extension GL_EXT_ray_query : require +#extension GL_EXT_scalar_block_layout : require + +layout(local_size_x = 64) in; + +struct GpuDirectPhaseSample { + vec3 position; + float occlusion; + vec3 normal; + float twosided; +}; + +struct GpuDirectPhaseSource { + vec3 position; + float light; + vec3 direction; + float dist; + vec3 color; + float atten; + uint type; // 0 = point, 1 = sun + uint formula; // light_formula_t for point lights + uint flags; // bit 0 = dirt + uint reserved0; + float anglescale; + float dirt; + float falloff; + float pad0; +}; + +struct GpuDirectAccum { + vec3 color; + float pad0; + vec3 normal; + float pad1; + uint hit; + uint reserved0; + uint reserved1; + uint reserved2; +}; + +layout(set = 0, binding = 0) uniform accelerationStructureEXT sceneAS; +layout(std430, set = 0, binding = 1) readonly buffer Samples { GpuDirectPhaseSample samples[]; } sampleBuffer; +layout(std430, set = 0, binding = 2) readonly buffer Sources { GpuDirectPhaseSource sources[]; } sourceBuffer; +layout(std430, set = 0, binding = 3) writeonly buffer Accum { GpuDirectAccum accum[]; } accumBuffer; + +layout(push_constant) uniform PushConstants { + uint sampleCount; + uint sourceCount; + uint flags; + uint reserved0; +} pc; + +bool occluded(vec3 origin, vec3 dir, float tmax) { + rayQueryEXT rq; + rayQueryInitializeEXT( + rq, + sceneAS, + gl_RayFlagsTerminateOnFirstHitEXT | gl_RayFlagsOpaqueEXT, + 0xff, + origin, + 0.01, + dir, + max(tmax, 0.02)); + + while (rayQueryProceedEXT(rq)) {} + + return rayQueryGetIntersectionTypeEXT(rq, true) != gl_RayQueryCommittedIntersectionNoneEXT; +} + +float point_light_value(uint formula, float light, float atten, float dist, float falloff) 
{ + float d = max(dist, 1.0); + float a = max(atten, 0.0001); + + // Mirrors the broad ericw-tools delay/formula families well enough for the + // experimental GPU fast path. Exact exotic cases should stay on CPU. + if (formula == 1u) { // LF_INVERSE + return light * 128.0 / (d * a); + } else if (formula == 2u) { // LF_INVERSE2 + return light * 128.0 * 128.0 / (d * d * a); + } else if (formula == 3u) { // LF_INFINITE + return light; + } else if (formula == 5u) { // LF_INVERSE2A + float da = d + 128.0; + return light * 128.0 * 128.0 / (da * da * a); + } else if (formula == 6u) { // LF_QRAD3-ish + float qd = max(d, 16.0); + return light * 128.0 * 128.0 / (qd * qd * a); + } + + // LF_LINEAR. If _falloff is set, use it as the zero point. Otherwise the + // classic formula is light - distance * attenuation. + if (falloff > 0.0) { + return light * max(0.0, 1.0 - d / falloff); + } + return light - d * a; +} + +void main() { + uint sample_id = gl_GlobalInvocationID.x; + if (sample_id >= pc.sampleCount) { + return; + } + + GpuDirectPhaseSample s = sampleBuffer.samples[sample_id]; + if (s.twosided < -0.5) { + accumBuffer.accum[sample_id].color = vec3(0.0); + accumBuffer.accum[sample_id].pad0 = 0.0; + accumBuffer.accum[sample_id].normal = vec3(0.0); + accumBuffer.accum[sample_id].pad1 = 0.0; + accumBuffer.accum[sample_id].hit = 0u; + accumBuffer.accum[sample_id].reserved0 = 0u; + accumBuffer.accum[sample_id].reserved1 = 0u; + accumBuffer.accum[sample_id].reserved2 = 0u; + return; + } + vec3 total_color = vec3(0.0); + vec3 total_normal = vec3(0.0); + uint any_hit = 0u; + + for (uint source_id = 0u; source_id < pc.sourceCount; ++source_id) { + GpuDirectPhaseSource l = sourceBuffer.sources[source_id]; + + vec3 ray_dir; + float ray_dist; + float value; + vec3 ncontrib_dir; + + if (l.type == 1u) { + ray_dir = normalize(l.direction); + ray_dist = l.dist; + float angle = dot(ray_dir, s.normal); + if (s.twosided > 0.5 && angle < 0.0) angle = -angle; + angle = max(0.0, angle); + angle 
= (1.0 - l.anglescale) + l.anglescale * angle; + value = l.light * angle; + ncontrib_dir = ray_dir; + } else { + vec3 to_light = l.position - s.position; + ray_dist = length(to_light); + if (ray_dist <= 0.01) { + continue; + } + ray_dir = to_light / ray_dist; + float angle = dot(ray_dir, s.normal); + if (s.twosided > 0.5 && angle < 0.0) angle = -angle; + if (angle <= 0.0) { + continue; + } + angle = (1.0 - l.anglescale) + l.anglescale * max(0.0, angle); + value = point_light_value(l.formula, l.light, l.atten, ray_dist, l.falloff) * angle; + ncontrib_dir = ray_dir; + } + + if (value <= 0.0) { + continue; + } + + float dirt_scale = ((l.flags & 1u) != 0u) ? clamp(s.occlusion, 0.0, 1.0) : 1.0; + value *= dirt_scale; + if (value <= 0.0) { + continue; + } + + if (!occluded(s.position, ray_dir, ray_dist)) { + total_color += l.color * (value / 255.0); + total_normal += ncontrib_dir * value; + any_hit = 1u; + } + } + + accumBuffer.accum[sample_id].color = total_color; + accumBuffer.accum[sample_id].pad0 = 0.0; + accumBuffer.accum[sample_id].normal = total_normal; + accumBuffer.accum[sample_id].pad1 = 0.0; + accumBuffer.accum[sample_id].hit = any_hit; + accumBuffer.accum[sample_id].reserved0 = 0u; + accumBuffer.accum[sample_id].reserved1 = 0u; + accumBuffer.accum[sample_id].reserved2 = 0u; +} diff --git a/light/gpu_shaders/occlusion.comp b/light/gpu_shaders/occlusion.comp new file mode 100644 index 00000000..d60d4fe8 --- /dev/null +++ b/light/gpu_shaders/occlusion.comp @@ -0,0 +1,59 @@ +#version 460 +#extension GL_EXT_ray_query : require +#extension GL_EXT_scalar_block_layout : require + +layout(local_size_x = 128) in; + +struct GpuRay { + float ox; float oy; float oz; float tmin; + float dx; float dy; float dz; float tmax; + uint shadowMask; + uint userIndex; +}; + +struct GpuOcclusionResult { + uint occluded; + uint reserved0; + float tr; + float tg; + float tb; +}; + +layout(push_constant) uniform PushConstants { + uint rayCount; + uint flags; +} pc; + +layout(set = 0, 
binding = 0) uniform accelerationStructureEXT sceneAS; +layout(scalar, set = 0, binding = 1) readonly buffer RayBuffer { GpuRay rays[]; } rayBuffer; +layout(scalar, set = 0, binding = 2) writeonly buffer ResultBuffer { GpuOcclusionResult results[]; } resultBuffer; + +void main() { + uint i = gl_GlobalInvocationID.x; + if (i >= pc.rayCount) { + return; + } + + GpuRay r = rayBuffer.rays[i]; + + rayQueryEXT rq; + rayQueryInitializeEXT( + rq, + sceneAS, + gl_RayFlagsTerminateOnFirstHitEXT | gl_RayFlagsOpaqueEXT, + 0xff, + vec3(r.ox, r.oy, r.oz), + r.tmin, + normalize(vec3(r.dx, r.dy, r.dz)), + r.tmax); + + while (rayQueryProceedEXT(rq)) { + } + + bool hit = rayQueryGetIntersectionTypeEXT(rq, true) != gl_RayQueryCommittedIntersectionNoneEXT; + resultBuffer.results[i].occluded = hit ? 1u : 0u; + resultBuffer.results[i].reserved0 = 0u; + resultBuffer.results[i].tr = 1.0; + resultBuffer.results[i].tg = 1.0; + resultBuffer.results[i].tb = 1.0; +} diff --git a/light/light.cc b/light/light.cc index a065f820..55ff4709 100644 --- a/light/light.cc +++ b/light/light.cc @@ -31,6 +31,7 @@ #include #include // for facesup_t #include +#include #include #include @@ -291,7 +292,7 @@ light_settings::light_settings() write_normals{this, "wrnormals", false, &output_group, "output normals, tangents and bitangents in a BSPX lump"}, novanilla{this, "novanilla", false, &experimental_group, "implies -bspxlit; don't write vanilla lighting"}, gate{this, "gate", LIGHT_EQUAL_EPSILON, &performance_group, "cutoff lights at this brightness level"}, - sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"}, + sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"}, gpu{this, "gpu", false, &performance_group, "use Vulkan GPU ray-query backend for batched visibility rays"}, arghradcompat{this, "arghradcompat", false, &output_group, "enable compatibility for Arghrad-specific keys"}, nolighting{this, 
"nolighting", false, &output_group, "don't output main world lighting (Q2RTX)"}, debugface{this, "debugface", std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), @@ -1339,6 +1340,16 @@ int light_main(int argc, const char **argv) FindDebugVert(&bsp); Embree_TraceInit(&bsp); +#if defined(HAVE_GPU_LIGHT) + if (light_options.gpu.value()) { + if (!GPU_TraceInit(&bsp)) { + logging::print("WARNING: -gpu requested, but GPU trace init failed: {}\n", GPU_TraceLastError()); + } else { + logging::print("GPU light tracing enabled.\n"); + } + } +#endif + if (light_options.debugmode == debugmodes::phong_obj) { CalculateVertexNormals(&bsp); @@ -1409,7 +1420,14 @@ int light_main(int argc, const char **argv) logging::print("{} empty lightmaps\n", static_cast(fully_transparent_lightmaps)); logging::close(); - return 0; + +#if defined(HAVE_GPU_LIGHT) + if (light_options.gpu.value()) { + GPU_DirectQueue_Flush(&bsp); + GPU_TraceShutdown(); + } +#endif +return 0; } int light_main(const std::vector &args) diff --git a/light/ltface.cc b/light/ltface.cc index 4634e54e..fc7c0197 100644 --- a/light/ltface.cc +++ b/light/ltface.cc @@ -18,6 +18,10 @@ */ #include +#include +#include +#include +#include #include #include @@ -2556,6 +2560,400 @@ lightsurf_t CreateLightmapSurface(const mbsp_t *bsp, const mface_t *face, const return Lightsurf_Init(modelinfo, cfg, face, bsp, facesup, facesup_decoupled); } + + +#if defined(HAVE_GPU_LIGHT) +static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps) +{ + // v5 disabled: per-face GPU direct was slower than Embree. + return false; + + // v4 disabled: per-face GPU direct was slower than Embree. + return false; + + // Disabled: this per-face GPU direct path is currently slower than Embree. + // It is not the final whole-phase batching architecture. 
+    return false;
+}
+#endif
+
+#if defined(HAVE_GPU_LIGHT)
+namespace {
+// One queued face: the slice of g_gpu_direct_samples that belongs to it and the
+// lightmap dictionary the accumulated light is written back into on flush.
+// NOTE(review): lightsurf/lightmaps are raw, non-owning pointers that are only
+// dereferenced later, at flush time; presumably the caller keeps both alive
+// until then -- confirm against the callers of DirectLightFace.
+struct gpu_direct_face_record_t {
+    lightsurf_t *lightsurf = nullptr;
+    lightmapdict_t *lightmaps = nullptr;
+    std::size_t first_sample = 0;
+    std::size_t sample_count = 0;
+};
+
+// Held while the queue below is built, appended to and flushed.
+std::mutex g_gpu_direct_queue_mutex;
+std::vector<gpu_light::direct_phase_sample_t> g_gpu_direct_samples;
+std::vector<gpu_light::direct_phase_source_t> g_gpu_direct_sources;
+std::vector<gpu_direct_face_record_t> g_gpu_direct_faces;
+bool g_gpu_direct_sources_built = false;
+bool g_gpu_direct_disabled = false;
+
+// Queue size (in samples) at which a face submission triggers a flush.
+static constexpr std::size_t GPU_DIRECT_FLUSH_SAMPLES = 1024ull * 1024ull;
+
+// Builds the GPU light-source list once (caller must hold the queue mutex).
+// Returns false, and disables the GPU direct phase, as soon as a light is found
+// that the shader path does not model; later calls return the cached verdict.
+static bool GPU_DirectQueue_BuildSourcesLocked()
+{
+    if (g_gpu_direct_sources_built) {
+        return !g_gpu_direct_disabled;
+    }
+    g_gpu_direct_sources_built = true;
+    g_gpu_direct_sources.clear();
+
+    for (const auto &entity_ptr : GetLights()) {
+        const light_t *entity = entity_ptr.get();
+        if (entity->nostaticlight.value()) continue;
+        if (entity->light.value() <= 0) continue;
+        if (entity->sun.value()) continue;
+
+        if (entity->style.value() != 0 ||
+            entity->shadow_channel_mask.value() != CHANNEL_MASK_DEFAULT ||
+            entity->light_channel_mask.value() != CHANNEL_MASK_DEFAULT ||
+            entity->spotlight || entity->projectedmip ||
+            entity->getFormula() == LF_LOCALMIN) {
+            logging::print("GPU direct phase: unsupported entity light encountered; falling back to CPU direct path.\n");
+            g_gpu_direct_disabled = true;
+ return false; + } + + gpu_light::direct_phase_source_t src{}; + const qvec3f origin = entity->origin.value(); + const qvec3f color = entity->color.value(); + src.px = origin[0]; src.py = origin[1]; src.pz = origin[2]; + src.light = entity->light.value(); + src.dx = 0; src.dy = 0; src.dz = 1; src.dist = 0; + src.cr = color[0]; src.cg = color[1]; src.cb = color[2]; + src.atten = entity->atten.value(); + src.type = 0; + src.formula = static_cast(entity->getFormula()); + src.flags = entity->dirt.value() ? 1u : 0u; + src.anglescale = entity->anglescale.value(); + src.dirt = entity->dirt.value(); + src.falloff = entity->falloff.value(); + g_gpu_direct_sources.push_back(src); + } + + for (const sun_t &sun : GetSuns()) { + if (sun.sunlight <= 0) continue; + if (sun.style != 0 || sun.suntexture_value) { + logging::print("GPU direct phase: unsupported sun style/texture encountered; falling back to CPU direct path.\n"); + g_gpu_direct_disabled = true; + return false; + } + qvec3f incoming = qv::normalize(sun.sunvec); + gpu_light::direct_phase_source_t src{}; + src.type = 1; + src.dx = incoming[0]; src.dy = incoming[1]; src.dz = incoming[2]; + src.dist = MAX_SKY_DIST; + src.light = sun.sunlight; + src.cr = sun.sunlight_color[0]; src.cg = sun.sunlight_color[1]; src.cb = sun.sunlight_color[2]; + src.atten = 1; + src.formula = 0; + src.flags = sun.dirt ? 1u : 0u; + src.anglescale = sun.anglescale; + src.dirt = sun.dirt ? 
1.0f : 0.0f; + g_gpu_direct_sources.push_back(src); + } + + logging::print("GPU direct phase: queued {} compatible direct sources.\n", g_gpu_direct_sources.size()); + if (g_gpu_direct_sources.empty()) { + return true; + } + return true; +} + +static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp) +{ + if (g_gpu_direct_samples.empty()) { + g_gpu_direct_faces.clear(); + return true; + } + + const auto t0 = std::chrono::steady_clock::now(); + std::vector accum(g_gpu_direct_samples.size()); + const bool ok = gpu_light::trace_direct_phase_batch( + g_gpu_direct_sources.data(), + g_gpu_direct_sources.size(), + g_gpu_direct_samples.data(), + accum.data(), + g_gpu_direct_samples.size()); + const auto t1 = std::chrono::steady_clock::now(); + const double gpu_ms = std::chrono::duration_cast(t1 - t0).count() / 1000.0; + + if (!ok) { + g_gpu_direct_disabled = true; + logging::print("ERROR: GPU direct phase dispatch failed: {}\n", GPU_TraceLastError()); + logging::print("ERROR: disabling GPU direct phase for the rest of this run. 
Re-run without -gpu for guaranteed CPU output.\n");
+        // NOTE(review): the faces queued in this batch are dropped below without a
+        // CPU re-run, although GPU_DirectQueue_AddFace already told their caller
+        // to skip the CPU direct-light loops.
+        g_gpu_direct_samples.clear();
+        g_gpu_direct_faces.clear();
+        return false;
+    }
+
+    // Scatter the per-sample results back into each queued face's style-0 lightmap.
+    for (const auto &rec : g_gpu_direct_faces) {
+        if (!rec.lightsurf || !rec.lightmaps || rec.sample_count == 0) {
+            continue;
+        }
+        lightmap_t *lightmap = Lightmap_ForStyle(rec.lightmaps, 0, rec.lightsurf);
+        bool hit = false;
+        for (std::size_t i = 0; i < rec.sample_count; ++i) {
+            // gi indexes the global batch, i the face-local lightmap sample.
+            const std::size_t gi = rec.first_sample + i;
+            if (!accum[gi].hit) continue;
+            const qvec3f color{accum[gi].cr, accum[gi].cg, accum[gi].cb};
+            const qvec3f normalcontrib{accum[gi].nr, accum[gi].ng, accum[gi].nb};
+            lightsample_t &sample = lightmap->samples[i];
+            sample.color += color;
+            sample.direction += normalcontrib;
+            lightmap->bounce_color += color;
+            hit = true;
+        }
+        if (hit) {
+            Lightmap_Save(bsp, rec.lightmaps, rec.lightsurf, lightmap, 0);
+        }
+    }
+
+    const std::uint64_t implicit_rays = static_cast<std::uint64_t>(g_gpu_direct_samples.size()) * static_cast<std::uint64_t>(g_gpu_direct_sources.size());
+    logging::print("GPU direct phase: flushed {} samples x {} sources = {} implicit rays in {:.3f} ms\n",
+        g_gpu_direct_samples.size(), g_gpu_direct_sources.size(), implicit_rays, gpu_ms);
+
+    g_gpu_direct_samples.clear();
+    g_gpu_direct_faces.clear();
+    return true;
+}
+} // namespace
+
+// Dispatches every queued direct-light sample to the GPU and writes the results
+// into the queued faces' lightmaps. Safe to call with an empty queue.
+// NOTE(review): light_main in this patch calls this only after logging::close(),
+// while the flush both logs through logging::print and saves lightmaps --
+// confirm the call site runs early enough for the results to be used.
+void GPU_DirectQueue_Flush(const mbsp_t *bsp)
+{
+    std::lock_guard lock(g_gpu_direct_queue_mutex);
+    GPU_DirectQueue_FlushLocked(bsp);
+}
+
+// Queues one face for the batched GPU direct-light phase.
+// Returns true when the caller must skip its CPU entity/sun loops for this face
+// (the face was queued, or nothing can light it); returns false when the GPU
+// path is unavailable or disabled and the CPU path has to run instead.
+static bool GPU_DirectQueue_AddFace(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps)
+{
+    if (!GPU_TraceAvailable() || !lightsurf || !lightmaps) {
+        return false;
+    }
+
+    // Decide GPU-vs-CPU under the lock before any "nothing to do" shortcut:
+    // g_gpu_direct_disabled is written under this mutex, and a face must never
+    // skip the CPU path unless the light set is known to be GPU-compatible.
+    std::lock_guard lock(g_gpu_direct_queue_mutex);
+    if (!GPU_DirectQueue_BuildSourcesLocked()) {
+        return false;
+    }
+    // Every accepted source uses the default channel mask, so a face outside
+    // that channel receives no direct light from them.
+    if (!(lightsurf->object_channel_mask & CHANNEL_MASK_DEFAULT)) {
+        return true;
+    }
+    const std::size_t sample_count = lightsurf->samples.size();
+    if (!sample_count) {
+        return true;
+    }
+    if (g_gpu_direct_sources.empty()) {
return true; + } + + const std::size_t first_sample = g_gpu_direct_samples.size(); + g_gpu_direct_faces.push_back(gpu_direct_face_record_t{lightsurf, lightmaps, first_sample, sample_count}); + + for (const auto &sample : lightsurf->samples) { + gpu_light::direct_phase_sample_t s{}; + if (!sample.occluded) { + s.px = sample.point[0]; s.py = sample.point[1]; s.pz = sample.point[2]; + s.nx = sample.normal[0]; s.ny = sample.normal[1]; s.nz = sample.normal[2]; + s.occlusion = sample.occlusion; + s.twosided = lightsurf->twosided ? 1.0f : 0.0f; + } else { + s.twosided = -1.0f; // sentinel: shader skips occluded/invalid samples + } + g_gpu_direct_samples.push_back(s); + } + + if (g_gpu_direct_samples.size() >= GPU_DIRECT_FLUSH_SAMPLES) { + GPU_DirectQueue_FlushLocked(bsp); + } + return true; +} +#endif + /* * ============ * LightFace @@ -2587,7 +2985,10 @@ void DirectLightFace(const mbsp_t *bsp, lightsurf_t &lightsurf, const settings:: /* positive lights */ if (!(modelinfo->lightignore.value() || extended_flags.light_ignore)) { - for (const auto &entity : GetLights()) { + #if defined(HAVE_GPU_LIGHT) + if (!GPU_DirectQueue_AddFace(bsp, &lightsurf, lightmaps)) { +#endif +for (const auto &entity : GetLights()) { if (entity->getFormula() == LF_LOCALMIN) continue; if (entity->nostaticlight.value()) @@ -2598,6 +2999,9 @@ void DirectLightFace(const mbsp_t *bsp, lightsurf_t &lightsurf, const settings:: for (const sun_t &sun : GetSuns()) if (sun.sunlight > 0) LightFace_Sky(bsp, &sun, &lightsurf, lightmaps); +#if defined(HAVE_GPU_LIGHT) + } +#endif // mxd. Add surface lights... 
// FIXME: negative surface lights diff --git a/light/trace_gpu.cc b/light/trace_gpu.cc new file mode 100644 index 00000000..e20a33fd --- /dev/null +++ b/light/trace_gpu.cc @@ -0,0 +1,233 @@ +#include + +#include +#include +#include + +#if defined(HAVE_GPU_LIGHT) +namespace gpu_light::vulkan_backend { +bool init(const mbsp_t *bsp, std::string &error); +void shutdown(); +bool trace_occlusion_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::ray_t *rays, + gpu_light::occlusion_result_t *results, + std::size_t count, + std::string &error); + +bool trace_direct_phase_batch( + const gpu_light::direct_phase_source_t *sources, + std::size_t source_count, + const gpu_light::direct_phase_sample_t *samples, + gpu_light::direct_phase_accum_t *accum, + std::size_t sample_count, + std::string &error); + +bool trace_direct_accumulate_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::direct_job_t *jobs, + std::size_t job_count, + const gpu_light::direct_sample_range_t *ranges, + gpu_light::direct_accum_t *accum, + std::size_t sample_count, + std::string &error); +} // namespace gpu_light::vulkan_backend +#endif + +namespace gpu_light { +namespace { +std::mutex g_mutex; +backend_state_t g_state = backend_state_t::unavailable; +std::string g_last_error; +stats_t g_stats; +} // namespace + +bool requested() { + // The apply script wires this to light_options.gpu in the call site. Keeping + // this function independent avoids pulling all light settings into this TU. 
+    return true;
+}
+
+// Current backend state, read under g_mutex.
+backend_state_t state() {
+    std::lock_guard lock(g_mutex);
+    return g_state;
+}
+
+// Name of the current state(); "unknown" for a value outside the enum.
+const char *state_string() {
+    switch (state()) {
+    case backend_state_t::unavailable: return "unavailable";
+    case backend_state_t::initialized: return "initialized";
+    case backend_state_t::failed: return "failed";
+    }
+    return "unknown";
+}
+
+// Most recent backend error message.
+//
+// g_last_error is rewritten under g_mutex (init() clears it and hands it to the
+// backend, trace_occlusion_batch() assigns to it), so a pointer into that
+// string is only safe while the lock is held. The text is therefore copied into
+// per-thread storage before the lock is dropped: the returned pointer stays
+// valid until the calling thread calls last_error() again.
+const char *last_error() {
+    thread_local std::string snapshot;
+    std::lock_guard lock(g_mutex);
+    snapshot = g_last_error;
+    return snapshot.c_str();
+}
+
+// Copy of the statistics counters, taken under g_mutex.
+stats_t stats() {
+    std::lock_guard lock(g_mutex);
+    return g_stats;
+}
+
+// Initializes the GPU backend for `bsp`.
+//
+// Returns true and sets the state to `initialized` when the Vulkan backend
+// comes up. Otherwise returns false with the state set to `failed` and
+// g_last_error holding whatever vulkan_backend::init() reported. Builds without
+// HAVE_GPU_LIGHT always return false and report `unavailable`.
+bool init(const mbsp_t *bsp) {
+    std::lock_guard lock(g_mutex);
+#if defined(HAVE_GPU_LIGHT)
+    g_last_error.clear();
+    if (vulkan_backend::init(bsp, g_last_error)) {
+        g_state = backend_state_t::initialized;
+        return true;
+    }
+    g_state = backend_state_t::failed;
+    return false;
+#else
+    (void)bsp;
+    g_last_error = "light was built without LIGHT_ENABLE_VULKAN_GPU=ON";
+    g_state = backend_state_t::unavailable;
+    return false;
+#endif
+}
+
+// Shuts the Vulkan backend down (when compiled in) and marks the backend
+// `unavailable`.
+void shutdown() {
+    std::lock_guard lock(g_mutex);
+#if defined(HAVE_GPU_LIGHT)
+    vulkan_backend::shutdown();
+#endif
+    g_state = backend_state_t::unavailable;
+}
+
+// Runs `count` occlusion rays through the GPU backend.
+//
+// Returns true when `results` was filled by the GPU, or when there is nothing
+// to do (null `rays`/`results`, or `count == 0`). Returns false when the
+// backend is not initialized or the Vulkan call failed; each such case is
+// counted in g_stats.fallback_batches, and a non-empty backend error message
+// replaces g_last_error.
+bool trace_occlusion_batch(
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const ray_t *rays,
+    occlusion_result_t *results,
+    std::size_t count) {
+    if (!rays || !results || count == 0) {
+        return true;
+    }
+
+    // Bookkeeping only: the lock is released before the Vulkan call below and
+    // taken again afterwards to record the outcome.
+    {
+        std::lock_guard lock(g_mutex);
+        g_stats.batches++;
+        g_stats.rays += count;
+        if (g_state != backend_state_t::initialized) {
+            g_stats.fallback_batches++;
+            return false;
+        }
+    }
+
+#if defined(HAVE_GPU_LIGHT)
+    std::string error;
+    const bool ok = vulkan_backend::trace_occlusion_batch(self, shadow_mask, rays, results, count, error);
+    std::lock_guard lock(g_mutex);
+    if (ok) {
+        g_stats.gpu_batches++;
+        return true;
+    }
+    g_stats.fallback_batches++;
+    if (!error.empty()) {
+        g_last_error = error;
+    }
+    return false;
+#else
+    (void)self;
+    (void)shadow_mask;
+    return false;
+#endif
+}
+
+
+bool trace_direct_phase_batch(
+    const
direct_phase_source_t *sources, + std::size_t source_count, + const direct_phase_sample_t *samples, + direct_phase_accum_t *accum, + std::size_t sample_count) { + if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) { + return true; + } + + const std::uint64_t implicit_rays = static_cast(source_count) * static_cast(sample_count); + { + std::lock_guard lock(g_mutex); + g_stats.batches++; + g_stats.rays += implicit_rays; + if (g_state != backend_state_t::initialized) { + g_stats.fallback_batches++; + return false; + } + } + +#if defined(HAVE_GPU_LIGHT) + std::string error; + const bool ok = vulkan_backend::trace_direct_phase_batch( + sources, source_count, samples, accum, sample_count, error); + std::lock_guard lock(g_mutex); + if (ok) { + g_stats.gpu_batches++; + return true; + } + g_stats.fallback_batches++; + if (!error.empty()) { + g_last_error = error; + } + return false; +#else + return false; +#endif +} + + +bool trace_direct_accumulate_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const direct_job_t *jobs, + std::size_t job_count, + const direct_sample_range_t *ranges, + direct_accum_t *accum, + std::size_t sample_count) { + if (!jobs || !ranges || !accum || job_count == 0 || sample_count == 0) { + return true; + } + + { + std::lock_guard lock(g_mutex); + g_stats.batches++; + g_stats.rays += job_count; + if (g_state != backend_state_t::initialized) { + g_stats.fallback_batches++; + return false; + } + } + +#if defined(HAVE_GPU_LIGHT) + std::string error; + const bool ok = vulkan_backend::trace_direct_accumulate_batch( + self, shadow_mask, jobs, job_count, ranges, accum, sample_count, error); + std::lock_guard lock(g_mutex); + if (ok) { + g_stats.gpu_batches++; + return true; + } + g_stats.fallback_batches++; + if (!error.empty()) { + g_last_error = error; + } + return false; +#else + (void)self; + (void)shadow_mask; + return false; +#endif +} + +} // namespace gpu_light + +bool GPU_TraceInit(const mbsp_t *bsp) { 
return gpu_light::init(bsp); } +void GPU_TraceShutdown() { gpu_light::shutdown(); } +bool GPU_TraceAvailable() { return gpu_light::state() == gpu_light::backend_state_t::initialized; } +const char *GPU_TraceLastError() { return gpu_light::last_error(); } diff --git a/light/trace_gpu_vulkan.cc b/light/trace_gpu_vulkan.cc new file mode 100644 index 00000000..eda4d928 --- /dev/null +++ b/light/trace_gpu_vulkan.cc @@ -0,0 +1,1244 @@ +#include + +#if defined(HAVE_GPU_LIGHT) + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#include +#endif + +namespace gpu_light::vulkan_backend { +namespace { + +struct buffer_t { + VkBuffer buffer = VK_NULL_HANDLE; + VkDeviceMemory memory = VK_NULL_HANDLE; + VkDeviceSize size = 0; +}; + +struct as_t { + VkAccelerationStructureKHR as = VK_NULL_HANDLE; + buffer_t storage; + VkDeviceAddress address = 0; +}; + +struct vertex_t { + float x, y, z; +}; + +struct gpu_ray_host_t { + float ox, oy, oz, tmin; + float dx, dy, dz, tmax; + std::uint32_t shadow_mask; + std::uint32_t user_index; +}; + +struct gpu_result_host_t { + std::uint32_t occluded; + std::uint32_t reserved0; + float tr, tg, tb; +}; + +struct gpu_direct_job_host_t { + float ox, oy, oz, tmin; + float dx, dy, dz, tmax; + float cr, cg, cb, pad0; + float nr, ng, nb, pad1; + std::uint32_t sample_index; + std::uint32_t flags; + std::uint32_t reserved0; + std::uint32_t reserved1; +}; + +struct gpu_direct_range_host_t { + std::uint32_t first; + std::uint32_t count; +}; + +struct gpu_direct_accum_host_t { + float cr, cg, cb, pad0; + float nr, ng, nb, pad1; + std::uint32_t hit; + std::uint32_t reserved0; + std::uint32_t reserved1; + std::uint32_t reserved2; +}; + + + +struct gpu_direct_phase_sample_host_t { + float px, py, pz, occlusion; + float nx, ny, nz, twosided; +}; + +struct gpu_direct_phase_source_host_t { + float 
px, py, pz, light; + float dx, dy, dz, dist; + float cr, cg, cb, atten; + std::uint32_t type; + std::uint32_t formula; + std::uint32_t flags; + std::uint32_t reserved0; + float anglescale; + float dirt; + float falloff; + float pad0; +}; +struct push_constants_t { + std::uint32_t ray_count; + std::uint32_t flags; +}; + +struct direct_push_constants_t { + std::uint32_t sample_count; + std::uint32_t source_count; + std::uint32_t flags; + std::uint32_t reserved0; +}; + +static_assert(sizeof(gpu_ray_host_t) == 40, "GPU ray layout must match shader"); +static_assert(sizeof(gpu_result_host_t) == 20, "GPU result layout must match shader"); +static_assert(sizeof(gpu_direct_job_host_t) == 80, "GPU direct job layout must match shader"); +static_assert(sizeof(gpu_direct_range_host_t) == 8, "GPU direct range layout must match shader"); +static_assert(sizeof(gpu_direct_accum_host_t) == 48, "GPU direct accum layout must match shader"); +static_assert(sizeof(gpu_direct_phase_sample_host_t) == 32, "GPU direct phase sample layout must match shader"); +static_assert(sizeof(gpu_direct_phase_source_host_t) == 80, "GPU direct phase source layout must match shader"); + +struct context_t { + VkInstance instance = VK_NULL_HANDLE; + VkPhysicalDevice physical = VK_NULL_HANDLE; + VkDevice device = VK_NULL_HANDLE; + VkQueue queue = VK_NULL_HANDLE; + std::uint32_t queue_family = 0; + + VkPhysicalDeviceMemoryProperties memory_props{}; + + VkCommandPool command_pool = VK_NULL_HANDLE; + VkCommandBuffer command_buffer = VK_NULL_HANDLE; + + PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR_ = nullptr; + PFN_vkCreateAccelerationStructureKHR vkCreateAccelerationStructureKHR_ = nullptr; + PFN_vkDestroyAccelerationStructureKHR vkDestroyAccelerationStructureKHR_ = nullptr; + PFN_vkGetAccelerationStructureBuildSizesKHR vkGetAccelerationStructureBuildSizesKHR_ = nullptr; + PFN_vkCmdBuildAccelerationStructuresKHR vkCmdBuildAccelerationStructuresKHR_ = nullptr; + 
PFN_vkGetAccelerationStructureDeviceAddressKHR vkGetAccelerationStructureDeviceAddressKHR_ = nullptr; + PFN_vkCmdWriteAccelerationStructuresPropertiesKHR vkCmdWriteAccelerationStructuresPropertiesKHR_ = nullptr; + + buffer_t vertices; + buffer_t indices; + buffer_t instances; + as_t blas; + as_t tlas; + + VkDescriptorSetLayout descriptor_set_layout = VK_NULL_HANDLE; + VkPipelineLayout pipeline_layout = VK_NULL_HANDLE; + VkPipeline pipeline = VK_NULL_HANDLE; + VkDescriptorPool descriptor_pool = VK_NULL_HANDLE; + VkDescriptorSet descriptor_set = VK_NULL_HANDLE; + + VkDescriptorSetLayout direct_descriptor_set_layout = VK_NULL_HANDLE; + VkPipelineLayout direct_pipeline_layout = VK_NULL_HANDLE; + VkPipeline direct_pipeline = VK_NULL_HANDLE; + VkDescriptorPool direct_descriptor_pool = VK_NULL_HANDLE; + VkDescriptorSet direct_descriptor_set = VK_NULL_HANDLE; + + std::size_t triangle_count = 0; + bool has_filtered_embree_geometry = false; +}; + +std::mutex g_mutex; +context_t g; + +static std::string vk_result_string(VkResult r) { + switch (r) { + case VK_SUCCESS: return "VK_SUCCESS"; + case VK_NOT_READY: return "VK_NOT_READY"; + case VK_TIMEOUT: return "VK_TIMEOUT"; + case VK_EVENT_SET: return "VK_EVENT_SET"; + case VK_EVENT_RESET: return "VK_EVENT_RESET"; + case VK_INCOMPLETE: return "VK_INCOMPLETE"; + case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER"; + default: 
return "VkResult(" + std::to_string(static_cast(r)) + ")"; + } +} + +static bool check(VkResult r, const char *what, std::string &error) { + if (r == VK_SUCCESS) return true; + error = std::string(what) + " failed: " + vk_result_string(r); + return false; +} + +static bool has_extension(const std::vector &props, const char *name) { + return std::any_of(props.begin(), props.end(), [&](const VkExtensionProperties &p) { + return std::strcmp(p.extensionName, name) == 0; + }); +} + +static void destroy_buffer(buffer_t &b) { + if (b.buffer) vkDestroyBuffer(g.device, b.buffer, nullptr); + if (b.memory) vkFreeMemory(g.device, b.memory, nullptr); + b = {}; +} + +static void destroy_as(as_t &a) { + if (a.as) g.vkDestroyAccelerationStructureKHR_(g.device, a.as, nullptr); + destroy_buffer(a.storage); + a = {}; +} + +static void destroy_locked() { + if (g.device) vkDeviceWaitIdle(g.device); + + if (g.direct_pipeline) vkDestroyPipeline(g.device, g.direct_pipeline, nullptr); + if (g.direct_pipeline_layout) vkDestroyPipelineLayout(g.device, g.direct_pipeline_layout, nullptr); + if (g.direct_descriptor_pool) vkDestroyDescriptorPool(g.device, g.direct_descriptor_pool, nullptr); + if (g.direct_descriptor_set_layout) vkDestroyDescriptorSetLayout(g.device, g.direct_descriptor_set_layout, nullptr); + + if (g.pipeline) vkDestroyPipeline(g.device, g.pipeline, nullptr); + if (g.pipeline_layout) vkDestroyPipelineLayout(g.device, g.pipeline_layout, nullptr); + if (g.descriptor_pool) vkDestroyDescriptorPool(g.device, g.descriptor_pool, nullptr); + if (g.descriptor_set_layout) vkDestroyDescriptorSetLayout(g.device, g.descriptor_set_layout, nullptr); + + destroy_as(g.tlas); + destroy_as(g.blas); + destroy_buffer(g.instances); + destroy_buffer(g.indices); + destroy_buffer(g.vertices); + + if (g.command_pool) vkDestroyCommandPool(g.device, g.command_pool, nullptr); + if (g.device) vkDestroyDevice(g.device, nullptr); + if (g.instance) vkDestroyInstance(g.instance, nullptr); + g = {}; +} + +static 
bool find_memory_type(std::uint32_t type_bits, VkMemoryPropertyFlags props, std::uint32_t &type_index) { + for (std::uint32_t i = 0; i < g.memory_props.memoryTypeCount; ++i) { + if ((type_bits & (1u << i)) && ((g.memory_props.memoryTypes[i].propertyFlags & props) == props)) { + type_index = i; + return true; + } + } + return false; +} + +static bool create_buffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags props, buffer_t &out, + std::string &error, const void *initial_data = nullptr) { + out = {}; + out.size = size; + + VkBufferCreateInfo bi{}; + bi.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bi.size = size; + bi.usage = usage; + bi.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + if (!check(vkCreateBuffer(g.device, &bi, nullptr, &out.buffer), "vkCreateBuffer", error)) return false; + + VkMemoryRequirements req{}; + vkGetBufferMemoryRequirements(g.device, out.buffer, &req); + + std::uint32_t mem_type = 0; + if (!find_memory_type(req.memoryTypeBits, props, mem_type)) { + error = "no compatible Vulkan memory type for buffer"; + destroy_buffer(out); + return false; + } + + VkMemoryAllocateFlagsInfo flags{}; + flags.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO; + flags.flags = (usage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT) ? VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT : 0; + + VkMemoryAllocateInfo ai{}; + ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + ai.pNext = flags.flags ? 
&flags : nullptr; + ai.allocationSize = req.size; + ai.memoryTypeIndex = mem_type; + + if (!check(vkAllocateMemory(g.device, &ai, nullptr, &out.memory), "vkAllocateMemory", error)) { + destroy_buffer(out); + return false; + } + if (!check(vkBindBufferMemory(g.device, out.buffer, out.memory, 0), "vkBindBufferMemory", error)) { + destroy_buffer(out); + return false; + } + + if (initial_data) { + void *mapped = nullptr; + if (!check(vkMapMemory(g.device, out.memory, 0, size, 0, &mapped), "vkMapMemory", error)) { + destroy_buffer(out); + return false; + } + std::memcpy(mapped, initial_data, static_cast(size)); + vkUnmapMemory(g.device, out.memory); + } + + return true; +} + +static VkDeviceAddress buffer_address(const buffer_t &b) { + VkBufferDeviceAddressInfo info{}; + info.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO; + info.buffer = b.buffer; + return g.vkGetBufferDeviceAddressKHR_(g.device, &info); +} + +static bool one_time_submit(const std::function &record, std::string &error) { + if (!check(vkResetCommandBuffer(g.command_buffer, 0), "vkResetCommandBuffer", error)) return false; + + VkCommandBufferBeginInfo bi{}; + bi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + bi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + if (!check(vkBeginCommandBuffer(g.command_buffer, &bi), "vkBeginCommandBuffer", error)) return false; + record(g.command_buffer); + if (!check(vkEndCommandBuffer(g.command_buffer), "vkEndCommandBuffer", error)) return false; + + VkSubmitInfo si{}; + si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + si.commandBufferCount = 1; + si.pCommandBuffers = &g.command_buffer; + if (!check(vkQueueSubmit(g.queue, 1, &si, VK_NULL_HANDLE), "vkQueueSubmit", error)) return false; + if (!check(vkQueueWaitIdle(g.queue), "vkQueueWaitIdle", error)) return false; + return true; +} + +static bool create_instance(std::string &error) { + VkApplicationInfo app{}; + app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + app.pApplicationName = "ericw-tools light 
gpu"; + app.applicationVersion = VK_MAKE_VERSION(0, 2, 0); + app.pEngineName = "ericw-tools"; + app.engineVersion = VK_MAKE_VERSION(0, 2, 0); + app.apiVersion = VK_API_VERSION_1_2; + + VkInstanceCreateInfo ci{}; + ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + ci.pApplicationInfo = &app; + return check(vkCreateInstance(&ci, nullptr, &g.instance), "vkCreateInstance", error); +} + +static bool pick_device(std::string &error) { + std::uint32_t count = 0; + if (!check(vkEnumeratePhysicalDevices(g.instance, &count, nullptr), "vkEnumeratePhysicalDevices(count)", error)) return false; + if (!count) { error = "no Vulkan physical devices found"; return false; } + + std::vector devices(count); + if (!check(vkEnumeratePhysicalDevices(g.instance, &count, devices.data()), "vkEnumeratePhysicalDevices(list)", error)) return false; + + for (VkPhysicalDevice dev : devices) { + std::uint32_t ext_count = 0; + vkEnumerateDeviceExtensionProperties(dev, nullptr, &ext_count, nullptr); + std::vector exts(ext_count); + vkEnumerateDeviceExtensionProperties(dev, nullptr, &ext_count, exts.data()); + + if (!has_extension(exts, VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME) || + !has_extension(exts, VK_KHR_RAY_QUERY_EXTENSION_NAME) || + !has_extension(exts, VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME) || + !has_extension(exts, VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME)) { + continue; + } + + VkPhysicalDeviceBufferDeviceAddressFeatures bda{}; + bda.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES; + VkPhysicalDeviceRayQueryFeaturesKHR rq{}; + rq.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR; + rq.pNext = &bda; + VkPhysicalDeviceAccelerationStructureFeaturesKHR as{}; + as.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR; + as.pNext = &rq; + VkPhysicalDeviceFeatures2 f2{}; + f2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + f2.pNext = &as; + vkGetPhysicalDeviceFeatures2(dev, &f2); + if 
(!as.accelerationStructure || !rq.rayQuery || !bda.bufferDeviceAddress) continue; + + std::uint32_t q_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties(dev, &q_count, nullptr); + std::vector qs(q_count); + vkGetPhysicalDeviceQueueFamilyProperties(dev, &q_count, qs.data()); + for (std::uint32_t i = 0; i < q_count; ++i) { + if (qs[i].queueFlags & VK_QUEUE_COMPUTE_BIT) { + g.physical = dev; + g.queue_family = i; + vkGetPhysicalDeviceMemoryProperties(dev, &g.memory_props); + return true; + } + } + } + + error = "no Vulkan device with acceleration_structure + ray_query + buffer_device_address + compute queue found"; + return false; +} + +static bool create_device(std::string &error) { + float priority = 1.0f; + VkDeviceQueueCreateInfo qci{}; + qci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + qci.queueFamilyIndex = g.queue_family; + qci.queueCount = 1; + qci.pQueuePriorities = &priority; + + VkPhysicalDeviceBufferDeviceAddressFeatures bda{}; + bda.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES; + bda.bufferDeviceAddress = VK_TRUE; + + VkPhysicalDeviceRayQueryFeaturesKHR rq{}; + rq.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR; + rq.rayQuery = VK_TRUE; + rq.pNext = &bda; + + VkPhysicalDeviceAccelerationStructureFeaturesKHR as{}; + as.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR; + as.accelerationStructure = VK_TRUE; + as.pNext = &rq; + + const char *extensions[] = { + VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME, + VK_KHR_RAY_QUERY_EXTENSION_NAME, + VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME, + VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME, + }; + + VkDeviceCreateInfo dci{}; + dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + dci.pNext = &as; + dci.queueCreateInfoCount = 1; + dci.pQueueCreateInfos = &qci; + dci.enabledExtensionCount = static_cast(sizeof(extensions) / sizeof(extensions[0])); + dci.ppEnabledExtensionNames = extensions; + + if (!check(vkCreateDevice(g.physical, 
&dci, nullptr, &g.device), "vkCreateDevice", error)) return false; + vkGetDeviceQueue(g.device, g.queue_family, 0, &g.queue); + +#define LOAD_DEVICE_PROC(name) \ + g.name##_ = reinterpret_cast(vkGetDeviceProcAddr(g.device, #name)); \ + if (!g.name##_) { error = "missing device proc " #name; return false; } + LOAD_DEVICE_PROC(vkGetBufferDeviceAddressKHR); + LOAD_DEVICE_PROC(vkCreateAccelerationStructureKHR); + LOAD_DEVICE_PROC(vkDestroyAccelerationStructureKHR); + LOAD_DEVICE_PROC(vkGetAccelerationStructureBuildSizesKHR); + LOAD_DEVICE_PROC(vkCmdBuildAccelerationStructuresKHR); + LOAD_DEVICE_PROC(vkGetAccelerationStructureDeviceAddressKHR); +#undef LOAD_DEVICE_PROC + + VkCommandPoolCreateInfo pci{}; + pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + pci.queueFamilyIndex = g.queue_family; + pci.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + if (!check(vkCreateCommandPool(g.device, &pci, nullptr, &g.command_pool), "vkCreateCommandPool", error)) return false; + + VkCommandBufferAllocateInfo cai{}; + cai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + cai.commandPool = g.command_pool; + cai.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + cai.commandBufferCount = 1; + if (!check(vkAllocateCommandBuffers(g.device, &cai, &g.command_buffer), "vkAllocateCommandBuffers", error)) return false; + + return true; +} + +static bool gather_geometry(const mbsp_t *bsp, std::vector &vertices, std::vector &indices, std::string &error) { + vertices.clear(); + indices.clear(); + + const auto &faces = ShadowCastingSolidFacesSet(); + if (faces.empty()) { + error = "no shadow-casting solid faces found for GPU BLAS; call Embree_TraceInit before GPU_TraceInit"; + return false; + } + + for (const mface_t *face : faces) { + if (!face || face->numedges < 3) continue; + const modelinfo_t *modelinfo = ModelInfoForFace(bsp, Face_GetNum(bsp, face)); + if (!modelinfo) continue; + + for (int j = 2; j < face->numedges; ++j) { + const int v0 = Face_VertexAtIndex(bsp, face, j - 
1); + const int v1 = Face_VertexAtIndex(bsp, face, j); + const int v2 = Face_VertexAtIndex(bsp, face, 0); + const qvec3f p0 = Vertex_GetPos(bsp, v0) + modelinfo->offset; + const qvec3f p1 = Vertex_GetPos(bsp, v1) + modelinfo->offset; + const qvec3f p2 = Vertex_GetPos(bsp, v2) + modelinfo->offset; + + const std::uint32_t base = static_cast(vertices.size()); + vertices.push_back({p0[0], p0[1], p0[2]}); + vertices.push_back({p1[0], p1[1], p1[2]}); + vertices.push_back({p2[0], p2[1], p2[2]}); + indices.push_back(base + 0); + indices.push_back(base + 1); + indices.push_back(base + 2); + } + } + + if (indices.empty()) { + error = "GPU geometry gather produced zero triangles"; + return false; + } + return true; +} + +static bool create_acceleration_structure(VkAccelerationStructureTypeKHR type, VkDeviceSize size, as_t &out, std::string &error) { + if (!create_buffer(size, + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + out.storage, + error)) return false; + + VkAccelerationStructureCreateInfoKHR ci{}; + ci.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR; + ci.type = type; + ci.size = size; + ci.buffer = out.storage.buffer; + if (!check(g.vkCreateAccelerationStructureKHR_(g.device, &ci, nullptr, &out.as), "vkCreateAccelerationStructureKHR", error)) return false; + + VkAccelerationStructureDeviceAddressInfoKHR ai{}; + ai.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_DEVICE_ADDRESS_INFO_KHR; + ai.accelerationStructure = out.as; + out.address = g.vkGetAccelerationStructureDeviceAddressKHR_(g.device, &ai); + return true; +} + +static bool build_blas(const std::vector &vertices, const std::vector &indices, std::string &error) { + if (!create_buffer(sizeof(vertex_t) * vertices.size(), + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | 
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + g.vertices, + error, + vertices.data())) return false; + + if (!create_buffer(sizeof(std::uint32_t) * indices.size(), + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + g.indices, + error, + indices.data())) return false; + + VkDeviceAddress vertex_addr = buffer_address(g.vertices); + VkDeviceAddress index_addr = buffer_address(g.indices); + + VkAccelerationStructureGeometryKHR geom{}; + geom.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR; + geom.geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR; + geom.flags = VK_GEOMETRY_OPAQUE_BIT_KHR; + geom.geometry.triangles.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_TRIANGLES_DATA_KHR; + geom.geometry.triangles.vertexFormat = VK_FORMAT_R32G32B32_SFLOAT; + geom.geometry.triangles.vertexData.deviceAddress = vertex_addr; + geom.geometry.triangles.vertexStride = sizeof(vertex_t); + geom.geometry.triangles.maxVertex = static_cast(vertices.size() - 1); + geom.geometry.triangles.indexType = VK_INDEX_TYPE_UINT32; + geom.geometry.triangles.indexData.deviceAddress = index_addr; + + const std::uint32_t prim_count = static_cast(indices.size() / 3); + g.triangle_count = prim_count; + + VkAccelerationStructureBuildGeometryInfoKHR build{}; + build.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR; + build.type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR; + build.flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR; + build.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR; + build.geometryCount = 1; + build.pGeometries = &geom; + + VkAccelerationStructureBuildSizesInfoKHR sizes{}; + sizes.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR; + g.vkGetAccelerationStructureBuildSizesKHR_(g.device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &build, &prim_count, 
&sizes);
+
+    if (!create_acceleration_structure(VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR, sizes.accelerationStructureSize, g.blas, error)) return false;
+
+    // Temporary build scratch; released right after one_time_submit() returns.
+    buffer_t scratch;
+    if (!create_buffer(sizes.buildScratchSize,
+            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+            scratch,
+            error)) return false;
+
+    build.dstAccelerationStructure = g.blas.as;
+    build.scratchData.deviceAddress = buffer_address(scratch);
+
+    VkAccelerationStructureBuildRangeInfoKHR range{};
+    range.primitiveCount = prim_count;
+    // The build command takes an array of pointers to range infos (one entry
+    // per build), hence the extra level of indirection.
+    const VkAccelerationStructureBuildRangeInfoKHR *range_ptr = &range;
+
+    bool ok = one_time_submit([&](VkCommandBuffer cmd) {
+        g.vkCmdBuildAccelerationStructuresKHR_(cmd, 1, &build, &range_ptr);
+    }, error);
+
+    destroy_buffer(scratch);
+    return ok;
+}
+
+// Builds the top-level acceleration structure g.tlas.
+//
+// The TLAS holds exactly one instance: g.blas with an identity transform
+// (only the diagonal of the zero-initialized matrix is set), visibility mask
+// 0xff and triangle face culling disabled. The instance record is uploaded to
+// g.instances. Returns false with `error` set when any buffer creation or the
+// submit fails.
+static bool build_tlas(std::string &error) {
+    VkAccelerationStructureInstanceKHR inst{};
+    inst.transform.matrix[0][0] = 1.0f;
+    inst.transform.matrix[1][1] = 1.0f;
+    inst.transform.matrix[2][2] = 1.0f;
+    inst.instanceCustomIndex = 0;
+    inst.mask = 0xff;
+    inst.instanceShaderBindingTableRecordOffset = 0;
+    inst.flags = VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR;
+    inst.accelerationStructureReference = g.blas.address;
+
+    if (!create_buffer(sizeof(inst),
+            VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+            g.instances,
+            error,
+            &inst)) return false;
+
+    VkAccelerationStructureGeometryKHR geom{};
+    geom.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR;
+    geom.geometryType = VK_GEOMETRY_TYPE_INSTANCES_KHR;
+    geom.flags = VK_GEOMETRY_OPAQUE_BIT_KHR;
+    geom.geometry.instances.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_INSTANCES_DATA_KHR;
+    geom.geometry.instances.arrayOfPointers = VK_FALSE;
+    geom.geometry.instances.data.deviceAddress = buffer_address(g.instances);
+
+    // One primitive == the single BLAS instance above.
+    const std::uint32_t prim_count = 1;
+
+    VkAccelerationStructureBuildGeometryInfoKHR build{};
+    build.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR;
+    build.type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR;
+    build.flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR;
+    build.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR;
+    build.geometryCount = 1;
+    build.pGeometries = &geom;
+
+    VkAccelerationStructureBuildSizesInfoKHR sizes{};
+    sizes.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR;
+    g.vkGetAccelerationStructureBuildSizesKHR_(g.device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &build, &prim_count, &sizes);
+
+    if (!create_acceleration_structure(VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR, sizes.accelerationStructureSize, g.tlas, error)) return false;
+
+    // Temporary build scratch; released right after one_time_submit() returns.
+    buffer_t scratch;
+    if (!create_buffer(sizes.buildScratchSize,
+            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+            scratch,
+            error)) return false;
+
+    build.dstAccelerationStructure = g.tlas.as;
+    build.scratchData.deviceAddress = buffer_address(scratch);
+
+    VkAccelerationStructureBuildRangeInfoKHR range{};
+    range.primitiveCount = prim_count;
+    // The build command takes an array of pointers to range infos (one entry
+    // per build), hence the extra level of indirection.
+    const VkAccelerationStructureBuildRangeInfoKHR *range_ptr = &range;
+
+    bool ok = one_time_submit([&](VkCommandBuffer cmd) {
+        g.vkCmdBuildAccelerationStructuresKHR_(cmd, 1, &build, &range_ptr);
+    }, error);
+
+    destroy_buffer(scratch);
+    return ok;
+}
+
+// Directory containing the running executable.
+//
+// On Linux this resolves the /proc/self/exe symlink. On other platforms, or
+// when readlink() fails, the current working directory is returned instead.
+static std::filesystem::path exe_dir() {
+#if defined(__linux__)
+    std::array buf{};
+    // One byte is held back so the result can be NUL-terminated below
+    // (readlink() does not terminate the string itself).
+    ssize_t len = readlink("/proc/self/exe", buf.data(), buf.size() - 1);
+    if (len > 0) {
+        buf[static_cast(len)] = '\0';
+        return std::filesystem::path(buf.data()).parent_path();
+    }
+#endif
+    return std::filesystem::current_path();
+}
+
+static bool read_file(const std::filesystem::path &path, std::vector &words, std::string &error) {
+    std::ifstream f(path, std::ios::binary |
std::ios::ate); + if (!f) { error = "could not open shader: " + path.string(); return false; } + const std::streamsize size = f.tellg(); + if (size <= 0 || (size % 4) != 0) { error = "shader has invalid SPIR-V size: " + path.string(); return false; } + f.seekg(0, std::ios::beg); + words.resize(static_cast(size / 4)); + if (!f.read(reinterpret_cast(words.data()), size)) { error = "failed to read shader: " + path.string(); return false; } + return true; +} + +static bool create_pipeline(std::string &error) { + VkDescriptorSetLayoutBinding b0{}; + b0.binding = 0; + b0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + b0.descriptorCount = 1; + b0.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b1{}; + b1.binding = 1; + b1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b1.descriptorCount = 1; + b1.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b2{}; + b2.binding = 2; + b2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b2.descriptorCount = 1; + b2.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + std::array bindings{b0, b1, b2}; + VkDescriptorSetLayoutCreateInfo dlci{}; + dlci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + dlci.bindingCount = static_cast(bindings.size()); + dlci.pBindings = bindings.data(); + if (!check(vkCreateDescriptorSetLayout(g.device, &dlci, nullptr, &g.descriptor_set_layout), "vkCreateDescriptorSetLayout", error)) return false; + + VkPushConstantRange pcr{}; + pcr.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + pcr.offset = 0; + pcr.size = sizeof(push_constants_t); + + VkPipelineLayoutCreateInfo plci{}; + plci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + plci.setLayoutCount = 1; + plci.pSetLayouts = &g.descriptor_set_layout; + plci.pushConstantRangeCount = 1; + plci.pPushConstantRanges = &pcr; + if (!check(vkCreatePipelineLayout(g.device, &plci, nullptr, &g.pipeline_layout), "vkCreatePipelineLayout", error)) return false; + + std::vector spv; + 
const auto shader_path = exe_dir() / "gpu_shaders" / "occlusion.comp.spv"; + if (!read_file(shader_path, spv, error)) return false; + + VkShaderModuleCreateInfo smci{}; + smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + smci.codeSize = spv.size() * sizeof(std::uint32_t); + smci.pCode = spv.data(); + VkShaderModule shader = VK_NULL_HANDLE; + if (!check(vkCreateShaderModule(g.device, &smci, nullptr, &shader), "vkCreateShaderModule", error)) return false; + + VkComputePipelineCreateInfo cpci{}; + cpci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + cpci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + cpci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + cpci.stage.module = shader; + cpci.stage.pName = "main"; + cpci.layout = g.pipeline_layout; + bool ok = check(vkCreateComputePipelines(g.device, VK_NULL_HANDLE, 1, &cpci, nullptr, &g.pipeline), "vkCreateComputePipelines", error); + vkDestroyShaderModule(g.device, shader, nullptr); + if (!ok) return false; + + VkDescriptorPoolSize ps0{}; + ps0.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + ps0.descriptorCount = 1; + VkDescriptorPoolSize ps1{}; + ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + ps1.descriptorCount = 2; + std::array sizes{ps0, ps1}; + + VkDescriptorPoolCreateInfo dpci{}; + dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + dpci.maxSets = 1; + dpci.poolSizeCount = static_cast(sizes.size()); + dpci.pPoolSizes = sizes.data(); + if (!check(vkCreateDescriptorPool(g.device, &dpci, nullptr, &g.descriptor_pool), "vkCreateDescriptorPool", error)) return false; + + VkDescriptorSetAllocateInfo dsai{}; + dsai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + dsai.descriptorPool = g.descriptor_pool; + dsai.descriptorSetCount = 1; + dsai.pSetLayouts = &g.descriptor_set_layout; + if (!check(vkAllocateDescriptorSets(g.device, &dsai, &g.descriptor_set), "vkAllocateDescriptorSets", error)) return false; + + return true; +} + +static bool 
create_direct_pipeline(std::string &error) { + VkDescriptorSetLayoutBinding b0{}; + b0.binding = 0; + b0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + b0.descriptorCount = 1; + b0.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b1{}; + b1.binding = 1; + b1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b1.descriptorCount = 1; + b1.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b2{}; + b2.binding = 2; + b2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b2.descriptorCount = 1; + b2.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b3{}; + b3.binding = 3; + b3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b3.descriptorCount = 1; + b3.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + std::array bindings{b0, b1, b2, b3}; + VkDescriptorSetLayoutCreateInfo dlci{}; + dlci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + dlci.bindingCount = static_cast(bindings.size()); + dlci.pBindings = bindings.data(); + if (!check(vkCreateDescriptorSetLayout(g.device, &dlci, nullptr, &g.direct_descriptor_set_layout), "vkCreateDescriptorSetLayout(direct)", error)) return false; + + VkPushConstantRange pcr{}; + pcr.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + pcr.offset = 0; + pcr.size = sizeof(direct_push_constants_t); + + VkPipelineLayoutCreateInfo plci{}; + plci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + plci.setLayoutCount = 1; + plci.pSetLayouts = &g.direct_descriptor_set_layout; + plci.pushConstantRangeCount = 1; + plci.pPushConstantRanges = &pcr; + if (!check(vkCreatePipelineLayout(g.device, &plci, nullptr, &g.direct_pipeline_layout), "vkCreatePipelineLayout(direct)", error)) return false; + + std::vector spv; + const auto shader_path = exe_dir() / "gpu_shaders" / "direct_phase.comp.spv"; + if (!read_file(shader_path, spv, error)) return false; + + VkShaderModuleCreateInfo smci{}; + smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; 
+ smci.codeSize = spv.size() * sizeof(std::uint32_t); + smci.pCode = spv.data(); + VkShaderModule shader = VK_NULL_HANDLE; + if (!check(vkCreateShaderModule(g.device, &smci, nullptr, &shader), "vkCreateShaderModule(direct)", error)) return false; + + VkComputePipelineCreateInfo cpci{}; + cpci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + cpci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + cpci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + cpci.stage.module = shader; + cpci.stage.pName = "main"; + cpci.layout = g.direct_pipeline_layout; + bool ok = check(vkCreateComputePipelines(g.device, VK_NULL_HANDLE, 1, &cpci, nullptr, &g.direct_pipeline), "vkCreateComputePipelines(direct)", error); + vkDestroyShaderModule(g.device, shader, nullptr); + if (!ok) return false; + + VkDescriptorPoolSize ps0{}; + ps0.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + ps0.descriptorCount = 1; + VkDescriptorPoolSize ps1{}; + ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + ps1.descriptorCount = 3; + std::array sizes{ps0, ps1}; + + VkDescriptorPoolCreateInfo dpci{}; + dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + dpci.maxSets = 1; + dpci.poolSizeCount = static_cast(sizes.size()); + dpci.pPoolSizes = sizes.data(); + if (!check(vkCreateDescriptorPool(g.device, &dpci, nullptr, &g.direct_descriptor_pool), "vkCreateDescriptorPool(direct)", error)) return false; + + VkDescriptorSetAllocateInfo dsai{}; + dsai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + dsai.descriptorPool = g.direct_descriptor_pool; + dsai.descriptorSetCount = 1; + dsai.pSetLayouts = &g.direct_descriptor_set_layout; + if (!check(vkAllocateDescriptorSets(g.device, &dsai, &g.direct_descriptor_set), "vkAllocateDescriptorSets(direct)", error)) return false; + + return true; +} + +static void update_direct_descriptor_set(const buffer_t &job_buffer, const buffer_t &range_buffer, const buffer_t &accum_buffer) { + VkWriteDescriptorSetAccelerationStructureKHR as_info{}; 
+ as_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR; + as_info.accelerationStructureCount = 1; + as_info.pAccelerationStructures = &g.tlas.as; + + VkWriteDescriptorSet w0{}; + w0.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w0.pNext = &as_info; + w0.dstSet = g.direct_descriptor_set; + w0.dstBinding = 0; + w0.descriptorCount = 1; + w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + + VkDescriptorBufferInfo job_info{}; + job_info.buffer = job_buffer.buffer; + job_info.offset = 0; + job_info.range = job_buffer.size; + VkWriteDescriptorSet w1{}; + w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w1.dstSet = g.direct_descriptor_set; + w1.dstBinding = 1; + w1.descriptorCount = 1; + w1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w1.pBufferInfo = &job_info; + + VkDescriptorBufferInfo range_info{}; + range_info.buffer = range_buffer.buffer; + range_info.offset = 0; + range_info.range = range_buffer.size; + VkWriteDescriptorSet w2{}; + w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w2.dstSet = g.direct_descriptor_set; + w2.dstBinding = 2; + w2.descriptorCount = 1; + w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w2.pBufferInfo = &range_info; + + VkDescriptorBufferInfo accum_info{}; + accum_info.buffer = accum_buffer.buffer; + accum_info.offset = 0; + accum_info.range = accum_buffer.size; + VkWriteDescriptorSet w3{}; + w3.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w3.dstSet = g.direct_descriptor_set; + w3.dstBinding = 3; + w3.descriptorCount = 1; + w3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w3.pBufferInfo = &accum_info; + + std::array writes{w0, w1, w2, w3}; + vkUpdateDescriptorSets(g.device, static_cast(writes.size()), writes.data(), 0, nullptr); +} + +static void update_descriptor_set(const buffer_t &ray_buffer, const buffer_t &result_buffer) { + VkWriteDescriptorSetAccelerationStructureKHR as_info{}; + as_info.sType = 
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR; + as_info.accelerationStructureCount = 1; + as_info.pAccelerationStructures = &g.tlas.as; + + VkWriteDescriptorSet w0{}; + w0.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w0.pNext = &as_info; + w0.dstSet = g.descriptor_set; + w0.dstBinding = 0; + w0.descriptorCount = 1; + w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + + VkDescriptorBufferInfo ray_info{}; + ray_info.buffer = ray_buffer.buffer; + ray_info.offset = 0; + ray_info.range = ray_buffer.size; + VkWriteDescriptorSet w1{}; + w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w1.dstSet = g.descriptor_set; + w1.dstBinding = 1; + w1.descriptorCount = 1; + w1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w1.pBufferInfo = &ray_info; + + VkDescriptorBufferInfo result_info{}; + result_info.buffer = result_buffer.buffer; + result_info.offset = 0; + result_info.range = result_buffer.size; + VkWriteDescriptorSet w2{}; + w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w2.dstSet = g.descriptor_set; + w2.dstBinding = 2; + w2.descriptorCount = 1; + w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w2.pBufferInfo = &result_info; + + std::array writes{w0, w1, w2}; + vkUpdateDescriptorSets(g.device, static_cast(writes.size()), writes.data(), 0, nullptr); +} + +} // namespace + +bool init(const mbsp_t *bsp, std::string &error) { + std::lock_guard lock(g_mutex); + destroy_locked(); + + if (!create_instance(error)) { destroy_locked(); return false; } + if (!pick_device(error)) { destroy_locked(); return false; } + if (!create_device(error)) { destroy_locked(); return false; } + + g.has_filtered_embree_geometry = !filtergeom.triInfo.empty(); + if (g.has_filtered_embree_geometry) { + logging::print("GPU light: filtered Embree geometry exists ({} tris); GPU will fall back for correctness.\n", filtergeom.triInfo.size()); + } + + std::vector vertices; + std::vector indices; + if (!gather_geometry(bsp, vertices, indices, 
error)) { destroy_locked(); return false; } + if (!build_blas(vertices, indices, error)) { destroy_locked(); return false; } + if (!build_tlas(error)) { destroy_locked(); return false; } + if (!create_pipeline(error)) { destroy_locked(); return false; } + if (!create_direct_pipeline(error)) { destroy_locked(); return false; } + + logging::print("GPU light: Vulkan ray-query BLAS/TLAS ready ({} opaque triangles).\n", g.triangle_count); + return true; +} + +void shutdown() { + std::lock_guard lock(g_mutex); + destroy_locked(); +} + +bool trace_occlusion_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::ray_t *rays, + gpu_light::occlusion_result_t *results, + std::size_t count, + std::string &error) { + std::lock_guard lock(g_mutex); + if (!g.device || !g.pipeline || !g.tlas.as) { + error = "Vulkan GPU backend is not initialized"; + return false; + } + + // Correctness guard: the GPU fast path only contains opaque solid/default geometry. + // If Embree has filtered geometry, let CPU handle batches so glass/fence/dynamic/channel filters remain correct. 
+ if (g.has_filtered_embree_geometry) { + return false; + } + if (shadow_mask != CHANNEL_MASK_DEFAULT) { + return false; + } + (void)self; + + std::vector gpu_rays(count); + for (std::size_t i = 0; i < count; ++i) { + gpu_rays[i].ox = rays[i].origin[0]; + gpu_rays[i].oy = rays[i].origin[1]; + gpu_rays[i].oz = rays[i].origin[2]; + gpu_rays[i].tmin = rays[i].tmin; + gpu_rays[i].dx = rays[i].direction[0]; + gpu_rays[i].dy = rays[i].direction[1]; + gpu_rays[i].dz = rays[i].direction[2]; + gpu_rays[i].tmax = rays[i].tmax; + gpu_rays[i].shadow_mask = rays[i].shadow_mask; + gpu_rays[i].user_index = rays[i].user_index; + } + + buffer_t ray_buffer; + buffer_t result_buffer; + std::vector zero_results(count); + + bool ok = create_buffer(sizeof(gpu_ray_host_t) * count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + ray_buffer, + error, + gpu_rays.data()); + if (!ok) return false; + + ok = create_buffer(sizeof(gpu_result_host_t) * count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + result_buffer, + error, + zero_results.data()); + if (!ok) { + destroy_buffer(ray_buffer); + return false; + } + + update_descriptor_set(ray_buffer, result_buffer); + + push_constants_t pc{}; + pc.ray_count = static_cast(count); + pc.flags = 0; + + ok = one_time_submit([&](VkCommandBuffer cmd) { + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.pipeline); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.pipeline_layout, 0, 1, &g.descriptor_set, 0, nullptr); + vkCmdPushConstants(cmd, g.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + vkCmdDispatch(cmd, (pc.ray_count + 127u) / 128u, 1, 1); + }, error); + + if (ok) { + void *mapped = nullptr; + ok = check(vkMapMemory(g.device, result_buffer.memory, 0, result_buffer.size, 0, &mapped), "vkMapMemory(result)", error); + if (ok) { + const auto *gpu_results = 
static_cast(mapped); + for (std::size_t i = 0; i < count; ++i) { + results[i].occluded = gpu_results[i].occluded; + results[i].reserved0 = gpu_results[i].reserved0; + results[i].transmittance[0] = gpu_results[i].tr; + results[i].transmittance[1] = gpu_results[i].tg; + results[i].transmittance[2] = gpu_results[i].tb; + } + vkUnmapMemory(g.device, result_buffer.memory); + } + } + + destroy_buffer(result_buffer); + destroy_buffer(ray_buffer); + return ok; +} + + +bool trace_direct_phase_batch( + const gpu_light::direct_phase_source_t *sources, + std::size_t source_count, + const gpu_light::direct_phase_sample_t *samples, + gpu_light::direct_phase_accum_t *accum, + std::size_t sample_count, + std::string &error) { + std::lock_guard lock(g_mutex); + if (!g.device || !g.direct_pipeline || !g.tlas.as) { + error = "Vulkan GPU direct phase backend is not initialized"; + return false; + } + if (g.has_filtered_embree_geometry) { + return false; + } + if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) { + return true; + } + + std::vector gpu_samples(sample_count); + for (std::size_t i = 0; i < sample_count; ++i) { + gpu_samples[i].px = samples[i].px; + gpu_samples[i].py = samples[i].py; + gpu_samples[i].pz = samples[i].pz; + gpu_samples[i].occlusion = samples[i].occlusion; + gpu_samples[i].nx = samples[i].nx; + gpu_samples[i].ny = samples[i].ny; + gpu_samples[i].nz = samples[i].nz; + gpu_samples[i].twosided = samples[i].twosided; + } + + std::vector gpu_sources(source_count); + for (std::size_t i = 0; i < source_count; ++i) { + gpu_sources[i].px = sources[i].px; + gpu_sources[i].py = sources[i].py; + gpu_sources[i].pz = sources[i].pz; + gpu_sources[i].light = sources[i].light; + gpu_sources[i].dx = sources[i].dx; + gpu_sources[i].dy = sources[i].dy; + gpu_sources[i].dz = sources[i].dz; + gpu_sources[i].dist = sources[i].dist; + gpu_sources[i].cr = sources[i].cr; + gpu_sources[i].cg = sources[i].cg; + gpu_sources[i].cb = sources[i].cb; + 
gpu_sources[i].atten = sources[i].atten; + gpu_sources[i].type = sources[i].type; + gpu_sources[i].formula = sources[i].formula; + gpu_sources[i].flags = sources[i].flags; + gpu_sources[i].reserved0 = 0; + gpu_sources[i].anglescale = sources[i].anglescale; + gpu_sources[i].dirt = sources[i].dirt; + gpu_sources[i].falloff = sources[i].falloff; + gpu_sources[i].pad0 = 0.0f; + } + + std::vector zero_accum(sample_count); + + buffer_t sample_buffer; + buffer_t source_buffer; + buffer_t accum_buffer; + + bool ok = create_buffer(sizeof(gpu_direct_phase_sample_host_t) * sample_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + sample_buffer, + error, + gpu_samples.data()); + if (!ok) return false; + + ok = create_buffer(sizeof(gpu_direct_phase_source_host_t) * source_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + source_buffer, + error, + gpu_sources.data()); + if (!ok) { + destroy_buffer(sample_buffer); + return false; + } + + ok = create_buffer(sizeof(gpu_direct_accum_host_t) * sample_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + accum_buffer, + error, + zero_accum.data()); + if (!ok) { + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return false; + } + + update_direct_descriptor_set(sample_buffer, source_buffer, accum_buffer); + + direct_push_constants_t pc{}; + pc.sample_count = static_cast(sample_count); + pc.source_count = static_cast(source_count); + pc.flags = 0; + pc.reserved0 = 0; + + ok = one_time_submit([&](VkCommandBuffer cmd) { + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.direct_pipeline); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.direct_pipeline_layout, 0, 1, &g.direct_descriptor_set, 0, nullptr); + vkCmdPushConstants(cmd, g.direct_pipeline_layout, 
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + vkCmdDispatch(cmd, (pc.sample_count + 63u) / 64u, 1, 1); + }, error); + + if (ok) { + void *mapped = nullptr; + ok = check(vkMapMemory(g.device, accum_buffer.memory, 0, accum_buffer.size, 0, &mapped), "vkMapMemory(direct phase accum)", error); + if (ok) { + const auto *gpu_accum = static_cast(mapped); + for (std::size_t i = 0; i < sample_count; ++i) { + accum[i].cr = gpu_accum[i].cr; + accum[i].cg = gpu_accum[i].cg; + accum[i].cb = gpu_accum[i].cb; + accum[i].pad0 = 0.0f; + accum[i].nr = gpu_accum[i].nr; + accum[i].ng = gpu_accum[i].ng; + accum[i].nb = gpu_accum[i].nb; + accum[i].pad1 = 0.0f; + accum[i].hit = gpu_accum[i].hit; + accum[i].reserved0 = 0; + accum[i].reserved1 = 0; + accum[i].reserved2 = 0; + } + vkUnmapMemory(g.device, accum_buffer.memory); + } + } + + destroy_buffer(accum_buffer); + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return ok; +} + +bool trace_direct_accumulate_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::direct_job_t *jobs, + std::size_t job_count, + const gpu_light::direct_sample_range_t *ranges, + gpu_light::direct_accum_t *accum, + std::size_t sample_count, + std::string &error) { + (void)self; + (void)shadow_mask; + (void)jobs; + (void)job_count; + (void)ranges; + (void)accum; + (void)sample_count; + error = "old direct job buffer path disabled in v5; use trace_direct_phase_batch"; + return false; +} + +} // namespace gpu_light::vulkan_backend + +#endif // HAVE_GPU_LIGHT From dd771a8b174bfc5964dbbfa775f9e1d26a042068 Mon Sep 17 00:00:00 2001 From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com> Date: Sat, 20 Jun 2026 03:57:28 +0900 Subject: [PATCH 2/7] clean up comments --- include/light/trace_gpu.hh | 3 +-- light/trace_gpu.cc | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/light/trace_gpu.hh b/include/light/trace_gpu.hh index a0ecfb51..f223bb90 100644 --- a/include/light/trace_gpu.hh +++ 
b/include/light/trace_gpu.hh @@ -130,11 +130,10 @@ bool trace_direct_accumulate_batch( } // namespace gpu_light -// C-style wrappers are easier to call from older code paths. bool GPU_TraceInit(const mbsp_t *bsp); void GPU_TraceShutdown(); bool GPU_TraceAvailable(); const char *GPU_TraceLastError(); -// Implemented in light/ltface.cc by the v5 overlay; flushes pending sample-driven direct-light work. +// Flushes pending sample-driven direct-light work. void GPU_DirectQueue_Flush(const mbsp_t *bsp); diff --git a/light/trace_gpu.cc b/light/trace_gpu.cc index e20a33fd..71a6e8e8 100644 --- a/light/trace_gpu.cc +++ b/light/trace_gpu.cc @@ -45,8 +45,7 @@ stats_t g_stats; } // namespace bool requested() { - // The apply script wires this to light_options.gpu in the call site. Keeping - // this function independent avoids pulling all light settings into this TU. + // Keeping this function independent avoids pulling all light settings into this TU. return true; } From 10e154b9880ecc7978650a25354b17e5c8cf8bd5 Mon Sep 17 00:00:00 2001 From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com> Date: Sat, 20 Jun 2026 04:19:02 +0900 Subject: [PATCH 3/7] Enhance GPU direct phase processing with additional structures and Vulkan integration - Added and reserved fields to and for better tracking of face data. - Introduced and structures to manage source ranges. - Updated function signatures to include new parameters for face ranges and source indices. - Modified Vulkan descriptor set and buffer updates to accommodate new data structures. - Improved shader code to utilize face range data for more efficient light processing. 
--- include/light/trace_gpu.hh | 15 +- light/gpu_shaders/direct_phase.comp | 60 ++++--- light/ltface.cc | 240 +++++++++++++++++++++++++--- light/trace_gpu.cc | 24 ++- light/trace_gpu_vulkan.cc | 143 ++++++++++++++--- 5 files changed, 418 insertions(+), 64 deletions(-) diff --git a/include/light/trace_gpu.hh b/include/light/trace_gpu.hh index f223bb90..96ac2f85 100644 --- a/include/light/trace_gpu.hh +++ b/include/light/trace_gpu.hh @@ -61,6 +61,15 @@ struct direct_accum_t { struct direct_phase_sample_t { float px = 0, py = 0, pz = 0, occlusion = 1; float nx = 0, ny = 0, nz = 1, twosided = 0; + std::uint32_t face_index = 0; + std::uint32_t reserved0 = 0; + std::uint32_t reserved1 = 0; + std::uint32_t reserved2 = 0; +}; + +struct direct_phase_face_range_t { + std::uint32_t source_begin = 0; + std::uint32_t source_count = 0; }; struct direct_phase_source_t { @@ -117,7 +126,11 @@ bool trace_direct_phase_batch( std::size_t source_count, const direct_phase_sample_t *samples, direct_phase_accum_t *accum, - std::size_t sample_count); + std::size_t sample_count, + const direct_phase_face_range_t *face_ranges, + std::size_t face_range_count, + const std::uint32_t *face_source_indices, + std::size_t face_source_index_count); bool trace_direct_accumulate_batch( const modelinfo_t *self, diff --git a/light/gpu_shaders/direct_phase.comp b/light/gpu_shaders/direct_phase.comp index 663ba73f..43dd6b51 100644 --- a/light/gpu_shaders/direct_phase.comp +++ b/light/gpu_shaders/direct_phase.comp @@ -9,6 +9,10 @@ struct GpuDirectPhaseSample { float occlusion; vec3 normal; float twosided; + uint faceIndex; + uint reserved0; + uint reserved1; + uint reserved2; }; struct GpuDirectPhaseSource { @@ -28,6 +32,11 @@ struct GpuDirectPhaseSource { float pad0; }; +struct GpuDirectPhaseFaceRange { + uint sourceBegin; + uint sourceCount; +}; + struct GpuDirectAccum { vec3 color; float pad0; @@ -42,7 +51,9 @@ struct GpuDirectAccum { layout(set = 0, binding = 0) uniform accelerationStructureEXT 
sceneAS; layout(std430, set = 0, binding = 1) readonly buffer Samples { GpuDirectPhaseSample samples[]; } sampleBuffer; layout(std430, set = 0, binding = 2) readonly buffer Sources { GpuDirectPhaseSource sources[]; } sourceBuffer; -layout(std430, set = 0, binding = 3) writeonly buffer Accum { GpuDirectAccum accum[]; } accumBuffer; +layout(std430, set = 0, binding = 3) readonly buffer FaceRanges { GpuDirectPhaseFaceRange ranges[]; } faceRangeBuffer; +layout(std430, set = 0, binding = 4) readonly buffer FaceSourceIndices { uint indices[]; } faceSourceIndexBuffer; +layout(std430, set = 0, binding = 5) writeonly buffer Accum { GpuDirectAccum accum[]; } accumBuffer; layout(push_constant) uniform PushConstants { uint sampleCount; @@ -72,30 +83,37 @@ float point_light_value(uint formula, float light, float atten, float dist, floa float d = max(dist, 1.0); float a = max(atten, 0.0001); - // Mirrors the broad ericw-tools delay/formula families well enough for the - // experimental GPU fast path. Exact exotic cases should stay on CPU. - if (formula == 1u) { // LF_INVERSE + if (formula == 1u) { return light * 128.0 / (d * a); - } else if (formula == 2u) { // LF_INVERSE2 + } else if (formula == 2u) { return light * 128.0 * 128.0 / (d * d * a); - } else if (formula == 3u) { // LF_INFINITE + } else if (formula == 3u) { return light; - } else if (formula == 5u) { // LF_INVERSE2A + } else if (formula == 5u) { float da = d + 128.0; return light * 128.0 * 128.0 / (da * da * a); - } else if (formula == 6u) { // LF_QRAD3-ish + } else if (formula == 6u) { float qd = max(d, 16.0); return light * 128.0 * 128.0 / (qd * qd * a); } - // LF_LINEAR. If _falloff is set, use it as the zero point. Otherwise the - // classic formula is light - distance * attenuation. 
if (falloff > 0.0) { return light * max(0.0, 1.0 - d / falloff); } return light - d * a; } +void clear_accum(uint sample_id) { + accumBuffer.accum[sample_id].color = vec3(0.0); + accumBuffer.accum[sample_id].pad0 = 0.0; + accumBuffer.accum[sample_id].normal = vec3(0.0); + accumBuffer.accum[sample_id].pad1 = 0.0; + accumBuffer.accum[sample_id].hit = 0u; + accumBuffer.accum[sample_id].reserved0 = 0u; + accumBuffer.accum[sample_id].reserved1 = 0u; + accumBuffer.accum[sample_id].reserved2 = 0u; +} + void main() { uint sample_id = gl_GlobalInvocationID.x; if (sample_id >= pc.sampleCount) { @@ -104,21 +122,25 @@ void main() { GpuDirectPhaseSample s = sampleBuffer.samples[sample_id]; if (s.twosided < -0.5) { - accumBuffer.accum[sample_id].color = vec3(0.0); - accumBuffer.accum[sample_id].pad0 = 0.0; - accumBuffer.accum[sample_id].normal = vec3(0.0); - accumBuffer.accum[sample_id].pad1 = 0.0; - accumBuffer.accum[sample_id].hit = 0u; - accumBuffer.accum[sample_id].reserved0 = 0u; - accumBuffer.accum[sample_id].reserved1 = 0u; - accumBuffer.accum[sample_id].reserved2 = 0u; + clear_accum(sample_id); return; } + + GpuDirectPhaseFaceRange r = faceRangeBuffer.ranges[s.faceIndex]; + if (r.sourceCount == 0u) { + clear_accum(sample_id); + return; + } + vec3 total_color = vec3(0.0); vec3 total_normal = vec3(0.0); uint any_hit = 0u; - for (uint source_id = 0u; source_id < pc.sourceCount; ++source_id) { + for (uint local_i = 0u; local_i < r.sourceCount; ++local_i) { + uint source_id = faceSourceIndexBuffer.indices[r.sourceBegin + local_i]; + if (source_id >= pc.sourceCount) { + continue; + } GpuDirectPhaseSource l = sourceBuffer.sources[source_id]; vec3 ray_dir; diff --git a/light/ltface.cc b/light/ltface.cc index fc7c0197..baddf249 100644 --- a/light/ltface.cc +++ b/light/ltface.cc @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -2565,14 +2566,7 @@ lightsurf_t CreateLightmapSurface(const mbsp_t *bsp, const mface_t *face, const #if defined(HAVE_GPU_LIGHT) static 
bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps) { - // v5 disabled: per-face GPU direct was slower than Embree. - return false; - - // v4 disabled: per-face GPU direct was slower than Embree. - return false; - // Disabled: this per-face GPU direct path is currently slower than Embree. - // It is not the final whole-phase batching architecture. return false; if (!GPU_TraceAvailable()) { @@ -2757,6 +2751,8 @@ static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, light + + #if defined(HAVE_GPU_LIGHT) namespace { struct gpu_direct_face_record_t { @@ -2769,12 +2765,154 @@ struct gpu_direct_face_record_t { std::mutex g_gpu_direct_queue_mutex; std::vector g_gpu_direct_samples; std::vector g_gpu_direct_sources; +std::vector g_gpu_direct_face_ranges; +std::vector g_gpu_direct_face_source_indices; std::vector g_gpu_direct_faces; bool g_gpu_direct_sources_built = false; bool g_gpu_direct_disabled = false; static constexpr std::size_t GPU_DIRECT_FLUSH_SAMPLES = 1024ull * 1024ull; +struct gpu_direct_source_key_t { + std::uint32_t type = 0; + std::uint32_t formula = 0; + std::uint32_t flags = 0; + int px = 0, py = 0, pz = 0; + int dx = 0, dy = 0, dz = 0; + int cr = 0, cg = 0, cb = 0; + int light = 0, atten = 0, anglescale = 0, falloff = 0; +}; + +static int GPU_Direct_Quantize(float v, float scale = 4096.0f) +{ + return static_cast(std::lround(v * scale)); +} + +static gpu_direct_source_key_t GPU_Direct_SourceKey(const gpu_light::direct_phase_source_t &s) +{ + gpu_direct_source_key_t k{}; + k.type = s.type; + k.formula = s.formula; + k.flags = s.flags; + k.px = GPU_Direct_Quantize(s.px); + k.py = GPU_Direct_Quantize(s.py); + k.pz = GPU_Direct_Quantize(s.pz); + k.dx = GPU_Direct_Quantize(s.dx); + k.dy = GPU_Direct_Quantize(s.dy); + k.dz = GPU_Direct_Quantize(s.dz); + k.cr = GPU_Direct_Quantize(s.cr); + k.cg = GPU_Direct_Quantize(s.cg); + k.cb = GPU_Direct_Quantize(s.cb); + k.light = GPU_Direct_Quantize(s.light, 
1024.0f); + k.atten = GPU_Direct_Quantize(s.atten, 1024.0f); + k.anglescale = GPU_Direct_Quantize(s.anglescale, 1024.0f); + k.falloff = GPU_Direct_Quantize(s.falloff, 1024.0f); + return k; +} + +static bool GPU_Direct_SourceKeyEquals(const gpu_direct_source_key_t &a, const gpu_direct_source_key_t &b) +{ + return a.type == b.type && a.formula == b.formula && a.flags == b.flags && + a.px == b.px && a.py == b.py && a.pz == b.pz && + a.dx == b.dx && a.dy == b.dy && a.dz == b.dz && + a.cr == b.cr && a.cg == b.cg && a.cb == b.cb && + a.light == b.light && a.atten == b.atten && + a.anglescale == b.anglescale && a.falloff == b.falloff; +} + +static void GPU_Direct_AddUniqueSource( + std::vector &keys, + const gpu_light::direct_phase_source_t &src) +{ + const auto key = GPU_Direct_SourceKey(src); + for (const auto &existing : keys) { + if (GPU_Direct_SourceKeyEquals(existing, key)) { + return; + } + } + keys.push_back(key); + g_gpu_direct_sources.push_back(src); +} + +static float GPU_Direct_EffectivePointRadius(const gpu_light::direct_phase_source_t &src) +{ + if (src.type == 1) { + return MAX_SKY_DIST; + } + if (src.formula == 3u) { // LF_INFINITE + return MAX_SKY_DIST; + } + if (src.falloff > 0.0f) { + return std::min(src.falloff, static_cast(MAX_SKY_DIST)); + } + // Conservative only for LF_LINEAR/default: value = light - distance * atten. + // Inverse formulas are treated as global unless they provide _falloff. 
+ if (src.formula == 0u && src.atten > 0.0001f && src.light > 0.0f) { + return std::min(src.light / src.atten, static_cast(MAX_SKY_DIST)); + } + return MAX_SKY_DIST; +} + +static float GPU_Direct_PointAABBDistance2( + const gpu_light::direct_phase_source_t &src, + const qvec3f &mins, + const qvec3f &maxs) +{ + const float p[3] = {src.px, src.py, src.pz}; + float d2 = 0.0f; + for (int axis = 0; axis < 3; ++axis) { + if (p[axis] < mins[axis]) { + const float d = mins[axis] - p[axis]; + d2 += d * d; + } else if (p[axis] > maxs[axis]) { + const float d = p[axis] - maxs[axis]; + d2 += d * d; + } + } + return d2; +} + +static bool GPU_Direct_SourceAffectsFace( + const gpu_light::direct_phase_source_t &src, + const qvec3f &mins, + const qvec3f &maxs, + const qvec3f &normal, + bool twosided) +{ + if (twosided) { + return true; + } + + if (src.type == 1) { + const qvec3f dir{src.dx, src.dy, src.dz}; + return qv::dot(normal, dir) > -0.05f; + } + + const float radius = GPU_Direct_EffectivePointRadius(src); + if (radius < static_cast(MAX_SKY_DIST) * 0.999f) { + const float d2 = GPU_Direct_PointAABBDistance2(src, mins, maxs); + if (d2 > radius * radius) { + return false; + } + } + + // Conservative face-normal cull for point lights: use vector from face center to light. 
+ const qvec3f center{ + (mins[0] + maxs[0]) * 0.5f, + (mins[1] + maxs[1]) * 0.5f, + (mins[2] + maxs[2]) * 0.5f}; + qvec3f to_light{src.px - center[0], src.py - center[1], src.pz - center[2]}; + const float to_light_len2 = qv::dot(to_light, to_light); + if (to_light_len2 > 0.0001f) { + to_light = to_light * (1.0f / std::sqrt(to_light_len2)); + if (qv::dot(normal, to_light) <= -0.10f) { + return false; + } + } + + return true; +} + static bool GPU_DirectQueue_BuildSourcesLocked() { if (g_gpu_direct_sources_built) { @@ -2782,7 +2920,9 @@ static bool GPU_DirectQueue_BuildSourcesLocked() } g_gpu_direct_sources_built = true; g_gpu_direct_sources.clear(); + std::vector unique_keys; + std::size_t raw_sources = 0; for (const auto &entity_ptr : GetLights()) { const light_t *entity = entity_ptr.get(); if (entity->nostaticlight.value()) continue; @@ -2813,7 +2953,8 @@ static bool GPU_DirectQueue_BuildSourcesLocked() src.anglescale = entity->anglescale.value(); src.dirt = entity->dirt.value(); src.falloff = entity->falloff.value(); - g_gpu_direct_sources.push_back(src); + ++raw_sources; + GPU_Direct_AddUniqueSource(unique_keys, src); } for (const sun_t &sun : GetSuns()) { @@ -2835,20 +2976,33 @@ static bool GPU_DirectQueue_BuildSourcesLocked() src.flags = sun.dirt ? 1u : 0u; src.anglescale = sun.anglescale; src.dirt = sun.dirt ? 
1.0f : 0.0f; - g_gpu_direct_sources.push_back(src); + ++raw_sources; + GPU_Direct_AddUniqueSource(unique_keys, src); } - logging::print("GPU direct phase: queued {} compatible direct sources.\n", g_gpu_direct_sources.size()); - if (g_gpu_direct_sources.empty()) { - return true; - } + logging::print("GPU direct phase: queued {} compatible direct sources ({} raw, {} deduped).\n", + g_gpu_direct_sources.size(), raw_sources, raw_sources - g_gpu_direct_sources.size()); return true; } +static std::uint64_t GPU_DirectQueue_ImplicitRayCountLocked() +{ + std::uint64_t implicit_rays = 0; + for (const auto &sample : g_gpu_direct_samples) { + const std::size_t face_index = sample.face_index; + if (face_index < g_gpu_direct_face_ranges.size()) { + implicit_rays += g_gpu_direct_face_ranges[face_index].source_count; + } + } + return implicit_rays; +} + static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp) { if (g_gpu_direct_samples.empty()) { g_gpu_direct_faces.clear(); + g_gpu_direct_face_ranges.clear(); + g_gpu_direct_face_source_indices.clear(); return true; } @@ -2859,7 +3013,11 @@ static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp) g_gpu_direct_sources.size(), g_gpu_direct_samples.data(), accum.data(), - g_gpu_direct_samples.size()); + g_gpu_direct_samples.size(), + g_gpu_direct_face_ranges.data(), + g_gpu_direct_face_ranges.size(), + g_gpu_direct_face_source_indices.data(), + g_gpu_direct_face_source_indices.size()); const auto t1 = std::chrono::steady_clock::now(); const double gpu_ms = std::chrono::duration_cast(t1 - t0).count() / 1000.0; @@ -2869,6 +3027,8 @@ static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp) logging::print("ERROR: disabling GPU direct phase for the rest of this run. 
Re-run without -gpu for guaranteed CPU output.\n"); g_gpu_direct_samples.clear(); g_gpu_direct_faces.clear(); + g_gpu_direct_face_ranges.clear(); + g_gpu_direct_face_source_indices.clear(); return false; } @@ -2894,12 +3054,14 @@ static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp) } } - const std::uint64_t implicit_rays = static_cast(g_gpu_direct_samples.size()) * static_cast(g_gpu_direct_sources.size()); - logging::print("GPU direct phase: flushed {} samples x {} sources = {} implicit rays in {:.3f} ms\n", - g_gpu_direct_samples.size(), g_gpu_direct_sources.size(), implicit_rays, gpu_ms); + const std::uint64_t implicit_rays = GPU_DirectQueue_ImplicitRayCountLocked(); + logging::print("GPU direct phase: flushed {} samples, {} unique sources, {} face-source refs = {} implicit rays in {:.3f} ms\n", + g_gpu_direct_samples.size(), g_gpu_direct_sources.size(), g_gpu_direct_face_source_indices.size(), implicit_rays, gpu_ms); g_gpu_direct_samples.clear(); g_gpu_direct_faces.clear(); + g_gpu_direct_face_ranges.clear(); + g_gpu_direct_face_source_indices.clear(); return true; } } // namespace @@ -2931,18 +3093,60 @@ static bool GPU_DirectQueue_AddFace(const mbsp_t *bsp, lightsurf_t *lightsurf, l return true; } + qvec3f mins{std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max()}; + qvec3f maxs{-std::numeric_limits::max(), -std::numeric_limits::max(), -std::numeric_limits::max()}; + qvec3f normal_sum{0, 0, 0}; + std::size_t valid_samples = 0; + for (const auto &sample : lightsurf->samples) { + if (sample.occluded) { + continue; + } + for (int axis = 0; axis < 3; ++axis) { + mins[axis] = std::min(mins[axis], sample.point[axis]); + maxs[axis] = std::max(maxs[axis], sample.point[axis]); + } + normal_sum += sample.normal; + ++valid_samples; + } + if (valid_samples == 0) { + return true; + } + + qvec3f face_normal = lightsurf->snormal; + const float normal_len2 = qv::dot(normal_sum, normal_sum); + if (normal_len2 > 0.0001f) { + face_normal = 
normal_sum * (1.0f / std::sqrt(normal_len2)); + } + + const std::uint32_t face_index = static_cast(g_gpu_direct_face_ranges.size()); + gpu_light::direct_phase_face_range_t face_range{}; + face_range.source_begin = static_cast(g_gpu_direct_face_source_indices.size()); + + for (std::uint32_t source_index = 0; source_index < g_gpu_direct_sources.size(); ++source_index) { + if (GPU_Direct_SourceAffectsFace(g_gpu_direct_sources[source_index], mins, maxs, face_normal, lightsurf->twosided)) { + g_gpu_direct_face_source_indices.push_back(source_index); + } + } + + face_range.source_count = static_cast(g_gpu_direct_face_source_indices.size()) - face_range.source_begin; + if (face_range.source_count == 0) { + return true; + } + g_gpu_direct_face_ranges.push_back(face_range); + const std::size_t first_sample = g_gpu_direct_samples.size(); g_gpu_direct_faces.push_back(gpu_direct_face_record_t{lightsurf, lightmaps, first_sample, sample_count}); for (const auto &sample : lightsurf->samples) { gpu_light::direct_phase_sample_t s{}; + s.face_index = face_index; if (!sample.occluded) { s.px = sample.point[0]; s.py = sample.point[1]; s.pz = sample.point[2]; s.nx = sample.normal[0]; s.ny = sample.normal[1]; s.nz = sample.normal[2]; s.occlusion = sample.occlusion; s.twosided = lightsurf->twosided ? 
1.0f : 0.0f; } else { - s.twosided = -1.0f; // sentinel: shader skips occluded/invalid samples + s.twosided = -1.0f; } g_gpu_direct_samples.push_back(s); } diff --git a/light/trace_gpu.cc b/light/trace_gpu.cc index 71a6e8e8..1f5a1732 100644 --- a/light/trace_gpu.cc +++ b/light/trace_gpu.cc @@ -22,6 +22,10 @@ bool trace_direct_phase_batch( const gpu_light::direct_phase_sample_t *samples, gpu_light::direct_phase_accum_t *accum, std::size_t sample_count, + const gpu_light::direct_phase_face_range_t *face_ranges, + std::size_t face_range_count, + const std::uint32_t *face_source_indices, + std::size_t face_source_index_count, std::string &error); bool trace_direct_accumulate_batch( @@ -145,12 +149,23 @@ bool trace_direct_phase_batch( std::size_t source_count, const direct_phase_sample_t *samples, direct_phase_accum_t *accum, - std::size_t sample_count) { - if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) { + std::size_t sample_count, + const direct_phase_face_range_t *face_ranges, + std::size_t face_range_count, + const std::uint32_t *face_source_indices, + std::size_t face_source_index_count) { + if (!sources || !samples || !accum || !face_ranges || !face_source_indices || source_count == 0 || sample_count == 0 || face_range_count == 0) { return true; } - const std::uint64_t implicit_rays = static_cast(source_count) * static_cast(sample_count); + std::uint64_t implicit_rays = 0; + for (std::size_t i = 0; i < face_range_count; ++i) { + implicit_rays += face_ranges[i].source_count; + } + if (implicit_rays == 0 || face_source_index_count == 0) { + return true; + } + implicit_rays *= static_cast(sample_count) / static_cast(face_range_count); { std::lock_guard lock(g_mutex); g_stats.batches++; @@ -164,7 +179,8 @@ bool trace_direct_phase_batch( #if defined(HAVE_GPU_LIGHT) std::string error; const bool ok = vulkan_backend::trace_direct_phase_batch( - sources, source_count, samples, accum, sample_count, error); + sources, source_count, samples, 
accum, sample_count, + face_ranges, face_range_count, face_source_indices, face_source_index_count, error); std::lock_guard lock(g_mutex); if (ok) { g_stats.gpu_batches++; diff --git a/light/trace_gpu_vulkan.cc b/light/trace_gpu_vulkan.cc index eda4d928..da78638e 100644 --- a/light/trace_gpu_vulkan.cc +++ b/light/trace_gpu_vulkan.cc @@ -90,6 +90,15 @@ struct gpu_direct_accum_host_t { struct gpu_direct_phase_sample_host_t { float px, py, pz, occlusion; float nx, ny, nz, twosided; + std::uint32_t face_index; + std::uint32_t reserved0; + std::uint32_t reserved1; + std::uint32_t reserved2; +}; + +struct gpu_direct_phase_face_range_host_t { + std::uint32_t source_begin; + std::uint32_t source_count; }; struct gpu_direct_phase_source_host_t { @@ -122,7 +131,8 @@ static_assert(sizeof(gpu_result_host_t) == 20, "GPU result layout must match sha static_assert(sizeof(gpu_direct_job_host_t) == 80, "GPU direct job layout must match shader"); static_assert(sizeof(gpu_direct_range_host_t) == 8, "GPU direct range layout must match shader"); static_assert(sizeof(gpu_direct_accum_host_t) == 48, "GPU direct accum layout must match shader"); -static_assert(sizeof(gpu_direct_phase_sample_host_t) == 32, "GPU direct phase sample layout must match shader"); +static_assert(sizeof(gpu_direct_phase_sample_host_t) == 48, "GPU direct phase sample layout must match shader"); +static_assert(sizeof(gpu_direct_phase_face_range_host_t) == 8, "GPU direct phase face range layout must match shader"); static_assert(sizeof(gpu_direct_phase_source_host_t) == 80, "GPU direct phase source layout must match shader"); struct context_t { @@ -793,7 +803,19 @@ static bool create_direct_pipeline(std::string &error) { b3.descriptorCount = 1; b3.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - std::array bindings{b0, b1, b2, b3}; + VkDescriptorSetLayoutBinding b4{}; + b4.binding = 4; + b4.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b4.descriptorCount = 1; + b4.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + 
VkDescriptorSetLayoutBinding b5{}; + b5.binding = 5; + b5.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b5.descriptorCount = 1; + b5.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + std::array bindings{b0, b1, b2, b3, b4, b5}; VkDescriptorSetLayoutCreateInfo dlci{}; dlci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; dlci.bindingCount = static_cast(bindings.size()); @@ -840,7 +862,7 @@ static bool create_direct_pipeline(std::string &error) { ps0.descriptorCount = 1; VkDescriptorPoolSize ps1{}; ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - ps1.descriptorCount = 3; + ps1.descriptorCount = 5; std::array sizes{ps0, ps1}; VkDescriptorPoolCreateInfo dpci{}; @@ -860,7 +882,12 @@ static bool create_direct_pipeline(std::string &error) { return true; } -static void update_direct_descriptor_set(const buffer_t &job_buffer, const buffer_t &range_buffer, const buffer_t &accum_buffer) { +static void update_direct_descriptor_set( + const buffer_t &sample_buffer, + const buffer_t &source_buffer, + const buffer_t &face_range_buffer, + const buffer_t &face_source_index_buffer, + const buffer_t &accum_buffer) { VkWriteDescriptorSetAccelerationStructureKHR as_info{}; as_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR; as_info.accelerationStructureCount = 1; @@ -874,43 +901,67 @@ static void update_direct_descriptor_set(const buffer_t &job_buffer, const buffe w0.descriptorCount = 1; w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; - VkDescriptorBufferInfo job_info{}; - job_info.buffer = job_buffer.buffer; - job_info.offset = 0; - job_info.range = job_buffer.size; + VkDescriptorBufferInfo sample_info{}; + sample_info.buffer = sample_buffer.buffer; + sample_info.offset = 0; + sample_info.range = sample_buffer.size; VkWriteDescriptorSet w1{}; w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; w1.dstSet = g.direct_descriptor_set; w1.dstBinding = 1; w1.descriptorCount = 1; w1.descriptorType = 
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - w1.pBufferInfo = &job_info; + w1.pBufferInfo = &sample_info; - VkDescriptorBufferInfo range_info{}; - range_info.buffer = range_buffer.buffer; - range_info.offset = 0; - range_info.range = range_buffer.size; + VkDescriptorBufferInfo source_info{}; + source_info.buffer = source_buffer.buffer; + source_info.offset = 0; + source_info.range = source_buffer.size; VkWriteDescriptorSet w2{}; w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; w2.dstSet = g.direct_descriptor_set; w2.dstBinding = 2; w2.descriptorCount = 1; w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - w2.pBufferInfo = &range_info; + w2.pBufferInfo = &source_info; - VkDescriptorBufferInfo accum_info{}; - accum_info.buffer = accum_buffer.buffer; - accum_info.offset = 0; - accum_info.range = accum_buffer.size; + VkDescriptorBufferInfo face_range_info{}; + face_range_info.buffer = face_range_buffer.buffer; + face_range_info.offset = 0; + face_range_info.range = face_range_buffer.size; VkWriteDescriptorSet w3{}; w3.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; w3.dstSet = g.direct_descriptor_set; w3.dstBinding = 3; w3.descriptorCount = 1; w3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - w3.pBufferInfo = &accum_info; + w3.pBufferInfo = &face_range_info; + + VkDescriptorBufferInfo face_source_index_info{}; + face_source_index_info.buffer = face_source_index_buffer.buffer; + face_source_index_info.offset = 0; + face_source_index_info.range = face_source_index_buffer.size; + VkWriteDescriptorSet w4{}; + w4.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w4.dstSet = g.direct_descriptor_set; + w4.dstBinding = 4; + w4.descriptorCount = 1; + w4.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w4.pBufferInfo = &face_source_index_info; - std::array writes{w0, w1, w2, w3}; + VkDescriptorBufferInfo accum_info{}; + accum_info.buffer = accum_buffer.buffer; + accum_info.offset = 0; + accum_info.range = accum_buffer.size; + VkWriteDescriptorSet w5{}; + w5.sType = 
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w5.dstSet = g.direct_descriptor_set; + w5.dstBinding = 5; + w5.descriptorCount = 1; + w5.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w5.pBufferInfo = &accum_info; + + std::array writes{w0, w1, w2, w3, w4, w5}; vkUpdateDescriptorSets(g.device, static_cast(writes.size()), writes.data(), 0, nullptr); } @@ -1089,6 +1140,10 @@ bool trace_direct_phase_batch( const gpu_light::direct_phase_sample_t *samples, gpu_light::direct_phase_accum_t *accum, std::size_t sample_count, + const gpu_light::direct_phase_face_range_t *face_ranges, + std::size_t face_range_count, + const std::uint32_t *face_source_indices, + std::size_t face_source_index_count, std::string &error) { std::lock_guard lock(g_mutex); if (!g.device || !g.direct_pipeline || !g.tlas.as) { @@ -1098,7 +1153,7 @@ bool trace_direct_phase_batch( if (g.has_filtered_embree_geometry) { return false; } - if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) { + if (!sources || !samples || !accum || !face_ranges || !face_source_indices || source_count == 0 || sample_count == 0 || face_range_count == 0 || face_source_index_count == 0) { return true; } @@ -1112,6 +1167,10 @@ bool trace_direct_phase_batch( gpu_samples[i].ny = samples[i].ny; gpu_samples[i].nz = samples[i].nz; gpu_samples[i].twosided = samples[i].twosided; + gpu_samples[i].face_index = samples[i].face_index; + gpu_samples[i].reserved0 = 0; + gpu_samples[i].reserved1 = 0; + gpu_samples[i].reserved2 = 0; } std::vector gpu_sources(source_count); @@ -1138,10 +1197,21 @@ bool trace_direct_phase_batch( gpu_sources[i].pad0 = 0.0f; } + std::vector gpu_face_ranges(face_range_count); + for (std::size_t i = 0; i < face_range_count; ++i) { + gpu_face_ranges[i].source_begin = face_ranges[i].source_begin; + gpu_face_ranges[i].source_count = face_ranges[i].source_count; + } + + std::vector gpu_face_source_indices(face_source_index_count); + std::memcpy(gpu_face_source_indices.data(), 
face_source_indices, sizeof(std::uint32_t) * face_source_index_count); + std::vector zero_accum(sample_count); buffer_t sample_buffer; buffer_t source_buffer; + buffer_t face_range_buffer; + buffer_t face_source_index_buffer; buffer_t accum_buffer; bool ok = create_buffer(sizeof(gpu_direct_phase_sample_host_t) * sample_count, @@ -1163,6 +1233,31 @@ bool trace_direct_phase_batch( return false; } + ok = create_buffer(sizeof(gpu_direct_phase_face_range_host_t) * face_range_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + face_range_buffer, + error, + gpu_face_ranges.data()); + if (!ok) { + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return false; + } + + ok = create_buffer(sizeof(std::uint32_t) * face_source_index_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + face_source_index_buffer, + error, + gpu_face_source_indices.data()); + if (!ok) { + destroy_buffer(face_range_buffer); + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return false; + } + ok = create_buffer(sizeof(gpu_direct_accum_host_t) * sample_count, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, @@ -1170,12 +1265,14 @@ bool trace_direct_phase_batch( error, zero_accum.data()); if (!ok) { + destroy_buffer(face_source_index_buffer); + destroy_buffer(face_range_buffer); destroy_buffer(source_buffer); destroy_buffer(sample_buffer); return false; } - update_direct_descriptor_set(sample_buffer, source_buffer, accum_buffer); + update_direct_descriptor_set(sample_buffer, source_buffer, face_range_buffer, face_source_index_buffer, accum_buffer); direct_push_constants_t pc{}; pc.sample_count = static_cast(sample_count); @@ -1214,6 +1311,8 @@ bool trace_direct_phase_batch( } destroy_buffer(accum_buffer); + destroy_buffer(face_source_index_buffer); + 
destroy_buffer(face_range_buffer); destroy_buffer(source_buffer); destroy_buffer(sample_buffer); return ok; From 0d9f25ed79ba375a6e8c85cddeb8c0ad52bcdfde Mon Sep 17 00:00:00 2001 From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com> Date: Sat, 20 Jun 2026 04:19:25 +0900 Subject: [PATCH 4/7] whitespace --- light/ltface.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/light/ltface.cc b/light/ltface.cc index baddf249..27fd406d 100644 --- a/light/ltface.cc +++ b/light/ltface.cc @@ -2751,8 +2751,6 @@ static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, light - - #if defined(HAVE_GPU_LIGHT) namespace { struct gpu_direct_face_record_t { From 4e1387f6c06437c356cd55305c1b141e548e9ade Mon Sep 17 00:00:00 2001 From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com> Date: Sat, 20 Jun 2026 04:34:45 +0900 Subject: [PATCH 5/7] Remove unused GPU light processing code from ltface.cc and adjust shader ray direction normalization in direct_phase.comp for improved performance. 
--- light/gpu_shaders/direct_phase.comp | 2 +- light/ltface.cc | 267 ++++++---------------------- 2 files changed, 60 insertions(+), 209 deletions(-) diff --git a/light/gpu_shaders/direct_phase.comp b/light/gpu_shaders/direct_phase.comp index 43dd6b51..79b7e167 100644 --- a/light/gpu_shaders/direct_phase.comp +++ b/light/gpu_shaders/direct_phase.comp @@ -149,7 +149,7 @@ void main() { vec3 ncontrib_dir; if (l.type == 1u) { - ray_dir = normalize(l.direction); + ray_dir = l.direction; ray_dist = l.dist; float angle = dot(ray_dir, s.normal); if (s.twosided > 0.5 && angle < 0.0) angle = -angle; diff --git a/light/ltface.cc b/light/ltface.cc index 27fd406d..fa3401e7 100644 --- a/light/ltface.cc +++ b/light/ltface.cc @@ -2563,194 +2563,6 @@ lightsurf_t CreateLightmapSurface(const mbsp_t *bsp, const mface_t *face, const -#if defined(HAVE_GPU_LIGHT) -static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps) -{ - - return false; - - if (!GPU_TraceAvailable()) { - return false; - } - - constexpr std::size_t GPU_DIRECT_MIN_JOBS = 32768; - - const settings::worldspawn_keys &cfg = *lightsurf->cfg; - const modelinfo_t *modelinfo = lightsurf->modelinfo; - const qplane3f &plane = lightsurf->plane; - const std::size_t sample_count = lightsurf->samples.size(); - if (!sample_count) { - return true; - } - - std::vector> per_sample(sample_count); - - auto add_job = [&](int sample_index, const qvec3f &origin, const qvec3f &direction, float dist, const qvec3f &color, const qvec3f &normalcontrib) { - gpu_light::direct_job_t job{}; - job.ox = origin[0]; - job.oy = origin[1]; - job.oz = origin[2]; - job.tmin = 0.01f; - job.dx = direction[0]; - job.dy = direction[1]; - job.dz = direction[2]; - job.tmax = dist; - job.cr = color[0]; - job.cg = color[1]; - job.cb = color[2]; - job.nr = normalcontrib[0]; - job.ng = normalcontrib[1]; - job.nb = normalcontrib[2]; - job.sample_index = static_cast(sample_index); - 
per_sample[static_cast(sample_index)].push_back(job); - }; - - // Entity lights. This fast path is deliberately style-0/default-channel only. - for (const auto &entity_ptr : GetLights()) { - const light_t *entity = entity_ptr.get(); - if (entity->getFormula() == LF_LOCALMIN) continue; - if (entity->nostaticlight.value()) continue; - if (entity->light.value() <= 0) continue; - - if (entity->style.value() != 0) return false; - if (entity->shadow_channel_mask.value() != CHANNEL_MASK_DEFAULT) return false; - if (entity->light_channel_mask.value() != CHANNEL_MASK_DEFAULT) return false; - - if (light_options.visapprox.value() == visapprox_t::VIS && - entity->light_channel_mask.value() == CHANNEL_MASK_DEFAULT && - entity->shadow_channel_mask.value() == CHANNEL_MASK_DEFAULT && - VisCullEntity(bsp, lightsurf->pvs, entity->leaf)) { - continue; - } - - const float planedist = plane.distance_to(entity->origin.value()); - if (planedist < 0 && !entity->bleed.value() && !lightsurf->curved && !lightsurf->twosided) { - continue; - } - if (CullLight(entity, lightsurf)) { - continue; - } - if (!(entity->light_channel_mask.value() & lightsurf->object_channel_mask)) { - continue; - } - - for (int i = 0; i < static_cast(lightsurf->samples.size()); i++) { - const auto &sample = lightsurf->samples[i]; - if (sample.occluded) continue; - - const qvec3f &surfpoint = sample.point; - const qvec3f &surfnorm = sample.normal; - qvec3f surfpointToLightDir; - float surfpointToLightDist; - qvec3f color; - qvec3f normalcontrib; - GetLightContrib(cfg, entity, surfnorm, true, surfpoint, lightsurf->twosided, color, surfpointToLightDir, normalcontrib, &surfpointToLightDist); - const float occlusion = Dirt_GetScaleFactor(cfg, sample.occlusion, entity, surfpointToLightDist, lightsurf); - color *= occlusion; - if (fabs(LightSample_Brightness(color)) <= light_options.gate.value()) { - continue; - } - add_job(i, surfpoint, surfpointToLightDir, surfpointToLightDist, color, normalcontrib); - } - } - - // 
Sunlight. The GPU AS contains opaque solids only, so a miss is treated as visible sky. - // Sun texture filtering and non-zero styles stay on the CPU path. - for (const sun_t &sun : GetSuns()) { - if (sun.sunlight <= 0) continue; - if (sun.style != 0) return false; - if (sun.suntexture_value) return false; - - qvec3f incoming = qv::normalize(sun.sunvec); - const float dp = qv::dot(incoming, plane.normal); - if (dp < -LIGHT_ANGLE_EPSILON && !lightsurf->curved && !lightsurf->twosided) { - continue; - } - if (!(lightsurf->object_channel_mask & CHANNEL_MASK_DEFAULT)) { - continue; - } - - for (int i = 0; i < static_cast(lightsurf->samples.size()); i++) { - const auto &sample = lightsurf->samples[i]; - if (sample.occluded) continue; - - const qvec3f &surfpoint = sample.point; - const qvec3f &surfnorm = sample.normal; - float angle = qv::dot(incoming, surfnorm); - if (lightsurf->twosided && angle < 0) { - angle = -angle; - } - angle = std::max(0.0f, angle); - angle = (1.0f - sun.anglescale) + sun.anglescale * angle; - float value = angle * sun.sunlight; - if (sun.dirt) { - value *= Dirt_GetScaleFactor(cfg, sample.occlusion, NULL, 0.0f, lightsurf); - } - qvec3f color = sun.sunlight_color * (value / 255.0f); - if (fabs(LightSample_Brightness(color)) <= light_options.gate.value()) { - continue; - } - qvec3f normalcontrib = incoming * value; - add_job(i, surfpoint, incoming, MAX_SKY_DIST, color, normalcontrib); - } - } - - std::size_t job_count = 0; - for (const auto &v : per_sample) { - job_count += v.size(); - } - if (job_count == 0) { - return true; - } - if (job_count < GPU_DIRECT_MIN_JOBS) { - return false; - } - - std::vector jobs; - std::vector ranges(sample_count); - jobs.reserve(job_count); - for (std::size_t i = 0; i < sample_count; ++i) { - ranges[i].first = static_cast(jobs.size()); - ranges[i].count = static_cast(per_sample[i].size()); - jobs.insert(jobs.end(), per_sample[i].begin(), per_sample[i].end()); - } - - std::vector accum(sample_count); - if 
(!gpu_light::trace_direct_accumulate_batch( - modelinfo, - CHANNEL_MASK_DEFAULT, - jobs.data(), - jobs.size(), - ranges.data(), - accum.data(), - sample_count)) { - return false; - } - - lightmap_t *lightmap = Lightmap_ForStyle(lightmaps, 0, lightsurf); - bool hit = false; - for (std::size_t i = 0; i < sample_count; ++i) { - if (!accum[i].hit) continue; - const qvec3f color{accum[i].cr, accum[i].cg, accum[i].cb}; - const qvec3f normalcontrib{accum[i].nr, accum[i].ng, accum[i].nb}; - lightsample_t &sample = lightmap->samples[i]; - sample.color += color; - sample.direction += normalcontrib; - lightmap->bounce_color += color; - hit = true; - } - if (hit) { - Lightmap_Save(bsp, lightmaps, lightsurf, lightmap, 0); - } - return true; -} -#endif - - - - - - #if defined(HAVE_GPU_LIGHT) namespace { struct gpu_direct_face_record_t { @@ -2777,10 +2589,14 @@ struct gpu_direct_source_key_t { std::uint32_t flags = 0; int px = 0, py = 0, pz = 0; int dx = 0, dy = 0, dz = 0; - int cr = 0, cg = 0, cb = 0; - int light = 0, atten = 0, anglescale = 0, falloff = 0; + int atten = 0, anglescale = 0, falloff = 0; }; +// lower values merge more nearby sun rays into one representative ray. +// This preserves approximate energy by accumulating light/color into the merged source. +// Raise to 64/128 for quality, lower to 16/8 for speed. 
+static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 32.0f; + static int GPU_Direct_Quantize(float v, float scale = 4096.0f) { return static_cast(std::lround(v * scale)); @@ -2795,13 +2611,10 @@ static gpu_direct_source_key_t GPU_Direct_SourceKey(const gpu_light::direct_phas k.px = GPU_Direct_Quantize(s.px); k.py = GPU_Direct_Quantize(s.py); k.pz = GPU_Direct_Quantize(s.pz); - k.dx = GPU_Direct_Quantize(s.dx); - k.dy = GPU_Direct_Quantize(s.dy); - k.dz = GPU_Direct_Quantize(s.dz); - k.cr = GPU_Direct_Quantize(s.cr); - k.cg = GPU_Direct_Quantize(s.cg); - k.cb = GPU_Direct_Quantize(s.cb); - k.light = GPU_Direct_Quantize(s.light, 1024.0f); + const float dir_scale = (s.type == 1u) ? GPU_DIRECT_SUN_DIR_MERGE_SCALE : 4096.0f; + k.dx = GPU_Direct_Quantize(s.dx, dir_scale); + k.dy = GPU_Direct_Quantize(s.dy, dir_scale); + k.dz = GPU_Direct_Quantize(s.dz, dir_scale); k.atten = GPU_Direct_Quantize(s.atten, 1024.0f); k.anglescale = GPU_Direct_Quantize(s.anglescale, 1024.0f); k.falloff = GPU_Direct_Quantize(s.falloff, 1024.0f); @@ -2813,18 +2626,44 @@ static bool GPU_Direct_SourceKeyEquals(const gpu_direct_source_key_t &a, const g return a.type == b.type && a.formula == b.formula && a.flags == b.flags && a.px == b.px && a.py == b.py && a.pz == b.pz && a.dx == b.dx && a.dy == b.dy && a.dz == b.dz && - a.cr == b.cr && a.cg == b.cg && a.cb == b.cb && - a.light == b.light && a.atten == b.atten && - a.anglescale == b.anglescale && a.falloff == b.falloff; + a.atten == b.atten && a.anglescale == b.anglescale && a.falloff == b.falloff; } -static void GPU_Direct_AddUniqueSource( +static void GPU_Direct_MergeInto(gpu_light::direct_phase_source_t &dst, const gpu_light::direct_phase_source_t &src) +{ + const float a = std::max(dst.light, 0.0f); + const float b = std::max(src.light, 0.0f); + const float total = a + b; + if (total <= 0.0f) { + return; + } + + dst.cr = (dst.cr * a + src.cr * b) / total; + dst.cg = (dst.cg * a + src.cg * b) / total; + dst.cb = (dst.cb * a + src.cb * b) / 
total; + + if (dst.type == 1u) { + qvec3f d{dst.dx * a + src.dx * b, dst.dy * a + src.dy * b, dst.dz * a + src.dz * b}; + const float len2 = qv::dot(d, d); + if (len2 > 0.0001f) { + d = d * (1.0f / std::sqrt(len2)); + dst.dx = d[0]; + dst.dy = d[1]; + dst.dz = d[2]; + } + } + + dst.light = total; +} + +static void GPU_Direct_AddMergedSource( std::vector &keys, const gpu_light::direct_phase_source_t &src) { const auto key = GPU_Direct_SourceKey(src); - for (const auto &existing : keys) { - if (GPU_Direct_SourceKeyEquals(existing, key)) { + for (std::size_t i = 0; i < keys.size(); ++i) { + if (GPU_Direct_SourceKeyEquals(keys[i], key)) { + GPU_Direct_MergeInto(g_gpu_direct_sources[i], src); return; } } @@ -2883,7 +2722,7 @@ static bool GPU_Direct_SourceAffectsFace( if (src.type == 1) { const qvec3f dir{src.dx, src.dy, src.dz}; - return qv::dot(normal, dir) > -0.05f; + return qv::dot(normal, dir) > -0.01f; } const float radius = GPU_Direct_EffectivePointRadius(src); @@ -2921,6 +2760,8 @@ static bool GPU_DirectQueue_BuildSourcesLocked() std::vector unique_keys; std::size_t raw_sources = 0; + std::size_t raw_point_sources = 0; + std::size_t raw_sun_sources = 0; for (const auto &entity_ptr : GetLights()) { const light_t *entity = entity_ptr.get(); if (entity->nostaticlight.value()) continue; @@ -2952,7 +2793,8 @@ static bool GPU_DirectQueue_BuildSourcesLocked() src.dirt = entity->dirt.value(); src.falloff = entity->falloff.value(); ++raw_sources; - GPU_Direct_AddUniqueSource(unique_keys, src); + ++raw_point_sources; + GPU_Direct_AddMergedSource(unique_keys, src); } for (const sun_t &sun : GetSuns()) { @@ -2975,11 +2817,20 @@ static bool GPU_DirectQueue_BuildSourcesLocked() src.anglescale = sun.anglescale; src.dirt = sun.dirt ? 
1.0f : 0.0f; ++raw_sources; - GPU_Direct_AddUniqueSource(unique_keys, src); + ++raw_sun_sources; + GPU_Direct_AddMergedSource(unique_keys, src); + } + + std::size_t merged_point_sources = 0; + std::size_t merged_sun_sources = 0; + for (const auto &src : g_gpu_direct_sources) { + if (src.type == 1u) ++merged_sun_sources; + else ++merged_point_sources; } - logging::print("GPU direct phase: queued {} compatible direct sources ({} raw, {} deduped).\n", - g_gpu_direct_sources.size(), raw_sources, raw_sources - g_gpu_direct_sources.size()); + logging::print("GPU direct phase: queued {} merged direct sources ({} raw: {} point, {} sun; merged: {} point, {} sun; {} merged away; sun merge scale {}).\n", + g_gpu_direct_sources.size(), raw_sources, raw_point_sources, raw_sun_sources, + merged_point_sources, merged_sun_sources, raw_sources - g_gpu_direct_sources.size(), GPU_DIRECT_SUN_DIR_MERGE_SCALE); return true; } From 5a3885e9ac5bccd57b52b980a0305ee1f33487fe Mon Sep 17 00:00:00 2001 From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com> Date: Sat, 20 Jun 2026 04:37:55 +0900 Subject: [PATCH 6/7] Update GPU_DIRECT_SUN_DIR_MERGE_SCALE for improved quality in light processing --- light/ltface.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/light/ltface.cc b/light/ltface.cc index fa3401e7..31f87183 100644 --- a/light/ltface.cc +++ b/light/ltface.cc @@ -2594,8 +2594,8 @@ struct gpu_direct_source_key_t { // lower values merge more nearby sun rays into one representative ray. // This preserves approximate energy by accumulating light/color into the merged source. -// Raise to 64/128 for quality, lower to 16/8 for speed. -static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 32.0f; +// Raise to 512~4096 for quality, lower to 16/8 for speed. 
+static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 512.0f; static int GPU_Direct_Quantize(float v, float scale = 4096.0f) { From 5091dc811bb30cfe964478e2c0d3360f7f06517c Mon Sep 17 00:00:00 2001 From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com> Date: Sat, 20 Jun 2026 11:03:09 +0900 Subject: [PATCH 7/7] Add GPU sun merging and source culling options - Introduced `gpusunmerge` and `gpusunmergequality` settings for approximate merging of nearby GPU sun jitter rays. - Added `gpusourcecull` and `gpusourcecullquality` settings to enable and control approximate GPU per-face source culling. - Updated light processing logic to utilize new settings for improved performance and quality in light rendering. --- include/light/light.hh | 4 ++ light/light.cc | 7 ++- light/ltface.cc | 100 ++++++++++++++++++++++++++++++++--------- 3 files changed, 90 insertions(+), 21 deletions(-) diff --git a/include/light/light.hh b/include/light/light.hh index 4b2852a6..177cf5f6 100644 --- a/include/light/light.hh +++ b/include/light/light.hh @@ -396,6 +396,10 @@ public: setting_bool novanilla; setting_scalar gate; setting_int32 sunsamples; + setting_bool gpusunmerge; // -gpusunmerge: approximate-merge nearby GPU sun jitter rays + setting_scalar gpusunmergequality; // -gpusunmergequality: 0 fast/rough, 1 slow/high quality + setting_bool gpusourcecull; // -gpusourcecull: use approximate GPU per-face source culling + setting_scalar gpusourcecullquality; // -gpusourcecullquality: 0 fast/aggressive, 1 safest/conservative settings::setting_bool gpu; // -gpu: use Vulkan GPU ray-query backend when available setting_bool arghradcompat; setting_bool nolighting; diff --git a/light/light.cc b/light/light.cc index 55ff4709..b20e4d3d 100644 --- a/light/light.cc +++ b/light/light.cc @@ -292,7 +292,12 @@ light_settings::light_settings() write_normals{this, "wrnormals", false, &output_group, "output normals, tangents and bitangents in a BSPX lump"}, novanilla{this, "novanilla", false, 
&experimental_group, "implies -bspxlit; don't write vanilla lighting"}, gate{this, "gate", LIGHT_EQUAL_EPSILON, &performance_group, "cutoff lights at this brightness level"}, - sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"}, gpu{this, "gpu", false, &performance_group, "use Vulkan GPU ray-query backend for batched visibility rays"}, + sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"}, + gpusunmerge{this, "gpusunmerge", false, &performance_group, "approximate-merge nearby GPU sun jitter rays"}, + gpusunmergequality{this, "gpusunmergequality", 0.75f, 0.0f, 1.0f, &performance_group, "GPU sun merge quality: 0 fast/rough, 1 slow/high quality"}, + gpusourcecull{this, "gpusourcecull", false, &performance_group, "use approximate GPU per-face source culling"}, + gpusourcecullquality{this, "gpusourcecullquality", 1.0f, 0.0f, 1.0f, &performance_group, "GPU source culling quality: 0 fast/aggressive, 1 safest/conservative"}, + gpu{this, "gpu", false, &performance_group, "use Vulkan GPU ray-query backend for batched visibility rays"}, arghradcompat{this, "arghradcompat", false, &output_group, "enable compatibility for Arghrad-specific keys"}, nolighting{this, "nolighting", false, &output_group, "don't output main world lighting (Q2RTX)"}, debugface{this, "debugface", std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), diff --git a/light/ltface.cc b/light/ltface.cc index 31f87183..649312ed 100644 --- a/light/ltface.cc +++ b/light/ltface.cc @@ -2592,10 +2592,45 @@ struct gpu_direct_source_key_t { int atten = 0, anglescale = 0, falloff = 0; }; -// lower values merge more nearby sun rays into one representative ray. -// This preserves approximate energy by accumulating light/color into the merged source. -// Raise to 512~4096 for quality, lower to 16/8 for speed. 
-static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 512.0f; +// Optional approximate sun-direction merge. Disabled by default for final quality. +// When enabled with -gpusunmerge, -gpusunmergequality maps to a direction quantization scale: +// 0.00 => 16 fastest/roughest +// 0.50 => 256 balanced preview +// 1.00 => 4096 best quality/least merging +static float GPU_Direct_SunMergeQuality() +{ + float q = light_options.gpusunmergequality.value(); + if (!std::isfinite(q)) { + q = 0.75f; + } + if (q < 0.0f) q = 0.0f; + if (q > 1.0f) q = 1.0f; + return q; +} + +static float GPU_Direct_SunMergeScale() +{ + if (!light_options.gpusunmerge.value()) { + return 65536.0f; // effectively exact; preserves final-quality sun jitter + } + return 16.0f * std::pow(256.0f, GPU_Direct_SunMergeQuality()); +} + +static float GPU_Direct_SourceCullQuality() +{ + float q = light_options.gpusourcecullquality.value(); + if (!std::isfinite(q)) { + q = 1.0f; + } + if (q < 0.0f) q = 0.0f; + if (q > 1.0f) q = 1.0f; + return q; +} + +static bool GPU_Direct_SourceCullEnabled() +{ + return light_options.gpusourcecull.value(); +} static int GPU_Direct_Quantize(float v, float scale = 4096.0f) { @@ -2611,7 +2646,7 @@ static gpu_direct_source_key_t GPU_Direct_SourceKey(const gpu_light::direct_phas k.px = GPU_Direct_Quantize(s.px); k.py = GPU_Direct_Quantize(s.py); k.pz = GPU_Direct_Quantize(s.pz); - const float dir_scale = (s.type == 1u) ? GPU_DIRECT_SUN_DIR_MERGE_SCALE : 4096.0f; + const float dir_scale = (s.type == 1u) ? 
GPU_Direct_SunMergeScale() : 4096.0f; k.dx = GPU_Direct_Quantize(s.dx, dir_scale); k.dy = GPU_Direct_Quantize(s.dy, dir_scale); k.dz = GPU_Direct_Quantize(s.dz, dir_scale); @@ -2716,34 +2751,52 @@ static bool GPU_Direct_SourceAffectsFace( const qvec3f &normal, bool twosided) { + if (!GPU_Direct_SourceCullEnabled()) { + return true; + } if (twosided) { return true; } + const float quality = GPU_Direct_SourceCullQuality(); + if (src.type == 1) { + // Sun normal culling is the quality-sensitive part. At max quality we keep + // all sun jitter directions for every face. Lower quality progressively + // removes back-facing sun directions. + if (quality >= 0.999f) { + return true; + } const qvec3f dir{src.dx, src.dy, src.dz}; - return qv::dot(normal, dir) > -0.01f; + const float threshold = -0.50f + (0.55f * (1.0f - quality)); // q=0 -> 0.05, q=1 -> -0.50 + return qv::dot(normal, dir) > threshold; } const float radius = GPU_Direct_EffectivePointRadius(src); if (radius < static_cast(MAX_SKY_DIST) * 0.999f) { + // More quality = more radius padding = less chance of missing a faint edge case. + const float padded_radius = radius * (1.0f + 3.0f * quality) + 256.0f * quality; const float d2 = GPU_Direct_PointAABBDistance2(src, mins, maxs); - if (d2 > radius * radius) { + if (d2 > padded_radius * padded_radius) { return false; } } - // Conservative face-normal cull for point lights: use vector from face center to light. - const qvec3f center{ - (mins[0] + maxs[0]) * 0.5f, - (mins[1] + maxs[1]) * 0.5f, - (mins[2] + maxs[2]) * 0.5f}; - qvec3f to_light{src.px - center[0], src.py - center[1], src.pz - center[2]}; - const float to_light_len2 = qv::dot(to_light, to_light); - if (to_light_len2 > 0.0001f) { - to_light = to_light * (1.0f / std::sqrt(to_light_len2)); - if (qv::dot(normal, to_light) <= -0.10f) { - return false; + // Conservative face-normal cull for point lights. At max quality this is disabled; + // lower quality allows removing back-facing points. 
+ if (quality < 0.999f) { + const qvec3f center{ + (mins[0] + maxs[0]) * 0.5f, + (mins[1] + maxs[1]) * 0.5f, + (mins[2] + maxs[2]) * 0.5f}; + qvec3f to_light{src.px - center[0], src.py - center[1], src.pz - center[2]}; + const float to_light_len2 = qv::dot(to_light, to_light); + if (to_light_len2 > 0.0001f) { + to_light = to_light * (1.0f / std::sqrt(to_light_len2)); + const float threshold = -0.75f + (0.65f * (1.0f - quality)); // q=0 -> -0.10, q=1 -> -0.75 + if (qv::dot(normal, to_light) <= threshold) { + return false; + } } } @@ -2828,9 +2881,16 @@ static bool GPU_DirectQueue_BuildSourcesLocked() else ++merged_point_sources; } - logging::print("GPU direct phase: queued {} merged direct sources ({} raw: {} point, {} sun; merged: {} point, {} sun; {} merged away; sun merge scale {}).\n", + const bool sun_merge_enabled = light_options.gpusunmerge.value(); + const float sun_merge_quality = GPU_Direct_SunMergeQuality(); + const float sun_merge_scale = GPU_Direct_SunMergeScale(); + const bool source_cull_enabled = GPU_Direct_SourceCullEnabled(); + const float source_cull_quality = GPU_Direct_SourceCullQuality(); + logging::print("GPU direct phase: queued {} direct sources ({} raw: {} point, {} sun; merged: {} point, {} sun; {} merged away; sun merge {}; quality {:.2f}; scale {:.1f}; source cull {}; quality {:.2f}).\n", g_gpu_direct_sources.size(), raw_sources, raw_point_sources, raw_sun_sources, - merged_point_sources, merged_sun_sources, raw_sources - g_gpu_direct_sources.size(), GPU_DIRECT_SUN_DIR_MERGE_SCALE); + merged_point_sources, merged_sun_sources, raw_sources - g_gpu_direct_sources.size(), + sun_merge_enabled ? "on" : "off", sun_merge_quality, sun_merge_scale, + source_cull_enabled ? "on" : "off", source_cull_quality); return true; }