From ee2c18cc998b27249bc297a22823c28ced64e230 Mon Sep 17 00:00:00 2001
From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com>
Date: Sat, 20 Jun 2026 03:54:31 +0900
Subject: [PATCH 1/7] gpu vulkan processing for lighting

---
 include/light/light.hh              |    1 +
 include/light/trace_embree.hh       |   47 +
 include/light/trace_gpu.hh          |  140 +++
 light/CMakeLists.txt                |   34 +-
 light/gpu_shaders/direct_phase.comp |  180 ++++
 light/gpu_shaders/occlusion.comp    |   59 ++
 light/light.cc                      |   22 +-
 light/ltface.cc                     |  406 ++++++++-
 light/trace_gpu.cc                  |  233 +++++
 light/trace_gpu_vulkan.cc           | 1244 +++++++++++++++++++++++++++
 10 files changed, 2362 insertions(+), 4 deletions(-)
 create mode 100644 include/light/trace_gpu.hh
 create mode 100644 light/gpu_shaders/direct_phase.comp
 create mode 100644 light/gpu_shaders/occlusion.comp
 create mode 100644 light/trace_gpu.cc
 create mode 100644 light/trace_gpu_vulkan.cc

diff --git a/include/light/light.hh b/include/light/light.hh
index 0e3cb7f5..4b2852a6 100644
--- a/include/light/light.hh
+++ b/include/light/light.hh
@@ -396,6 +396,7 @@ public:
     setting_bool novanilla;
     setting_scalar gate;
     setting_int32 sunsamples;
+    settings::setting_bool gpu; // -gpu: use Vulkan GPU ray-query backend when available
     setting_bool arghradcompat;
     setting_bool nolighting;
     setting_vec3 debugface;
diff --git a/include/light/trace_embree.hh b/include/light/trace_embree.hh
index 3f4856c5..997bc404 100644
--- a/include/light/trace_embree.hh
+++ b/include/light/trace_embree.hh
@@ -18,6 +18,7 @@
  */
 
 #pragma once
+#include <light/trace_gpu.hh>
 
 #include <common/aligned_allocator.hh>
 #include <common/qvec.hh>
@@ -280,6 +281,52 @@ public:
         if (!_rays.size())
             return;
 
+#if defined(HAVE_GPU_LIGHT)
+        // Batches at or above this size are offered to the Vulkan occlusion
+        // kernel (v5 direct lighting itself goes through direct_phase.comp).
+        // Smaller raystreams, or a batch the GPU declines, use Embree below.
+        constexpr size_t GPU_OCCLUSION_MIN_BATCH = 262144;
+        const size_t gpu_batch_size = _rays.size();
+
+        if (gpu_batch_size >= GPU_OCCLUSION_MIN_BATCH && GPU_TraceAvailable()) {
+            const auto mask_bits = static_cast<std::uint32_t>(shadowmask);
+            std::vector<gpu_light::ray_t> packed(gpu_batch_size);
+            std::vector<gpu_light::occlusion_result_t> verdicts(gpu_batch_size);
+
+            // Repack every queued Embree ray into the flat GPU ray record.
+            for (size_t slot = 0; slot < gpu_batch_size; ++slot) {
+                const auto &in = _rays[slot].ray.ray;
+                gpu_light::ray_t &out = packed[slot];
+
+                out.origin[0] = in.org_x;
+                out.origin[1] = in.org_y;
+                out.origin[2] = in.org_z;
+                out.direction[0] = in.dir_x;
+                out.direction[1] = in.dir_y;
+                out.direction[2] = in.dir_z;
+
+                out.tmin = in.tnear;
+                out.tmax = in.tfar;
+                out.shadow_mask = mask_bits;
+                out.user_index = static_cast<std::uint32_t>(slot);
+            }
+
+            const bool handled = gpu_light::trace_occlusion_batch(
+                self, mask_bits, packed.data(), verdicts.data(), packed.size());
+            if (handled) {
+                // An occluded ray is reported by forcing its tfar negative.
+                for (size_t slot = 0; slot < gpu_batch_size; ++slot) {
+                    if (!verdicts[slot].occluded) {
+                        continue;
+                    }
+                    auto &hit_ray = _rays[slot].ray.ray;
+                    hit_ray.tfar = -std::abs(hit_ray.tfar);
+                }
+                return;
+            }
+        }
+#endif
+
         ray_source_info ctx2(this, self, shadowmask);
         RTCOccludedArguments embree4_args = ctx2.setup_occluded_arguments();
         for (auto &ray : _rays)
diff --git a/include/light/trace_gpu.hh b/include/light/trace_gpu.hh
new file mode 100644
index 00000000..a0ecfb51
--- /dev/null
+++ b/include/light/trace_gpu.hh
@@ -0,0 +1,140 @@
+/* GPU trace backend
+ * Prototype overlay generated for Linux/Vulkan ray-query development.
+ */
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+#include <vector>
+
+struct mbsp_t;
+class modelinfo_t;
+
+#ifndef HAVE_GPU_LIGHT
+#define GPU_LIGHT_COMPILED 0
+#else
+#define GPU_LIGHT_COMPILED 1
+#endif
+
+namespace gpu_light {
+
+struct ray_t { // one occlusion ray; field order mirrors GpuRay in gpu_shaders/occlusion.comp
+    float origin[3] = {0, 0, 0};
+    float tmin = 0.01f; // near end of the traced interval
+    float direction[3] = {0, 0, 1}; // occlusion.comp normalizes this before tracing
+    float tmax = 0.0f; // far end of the traced interval
+    std::uint32_t shadow_mask = 0xffffffffu; // NOTE(review): occlusion.comp never reads shadowMask
+    std::uint32_t user_index = 0; // the raystream caller stores the ray's index in the batch here
+};
+
+struct occlusion_result_t { // one result per ray; mirrors GpuOcclusionResult in occlusion.comp
+    std::uint32_t occluded = 0; // occlusion.comp writes 1 on a committed hit, otherwise 0
+    std::uint32_t reserved0 = 0;
+    float transmittance[3] = {1.0f, 1.0f, 1.0f}; // occlusion.comp currently always writes 1.0
+};
+
+struct direct_job_t { // one visibility ray plus the contribution it carries (job-driven path)
+    float ox = 0, oy = 0, oz = 0, tmin = 0.01f; // ray origin and near limit
+    float dx = 0, dy = 0, dz = 1, tmax = 0.0f; // ray direction and far limit
+    float cr = 0, cg = 0, cb = 0, pad0 = 0; // colour contribution of this job
+    float nr = 0, ng = 0, nb = 0, pad1 = 0; // direction ("normal") contribution of this job
+    std::uint32_t sample_index = 0; // index of the lightsurf sample the job belongs to
+    std::uint32_t flags = 0;
+    std::uint32_t reserved0 = 0;
+    std::uint32_t reserved1 = 0;
+};
+
+struct direct_sample_range_t { // slice [first, first + count) of the job array for one sample
+    std::uint32_t first = 0;
+    std::uint32_t count = 0;
+};
+
+struct direct_accum_t { // per-sample result; field order mirrors GpuDirectAccum in direct_phase.comp
+    float cr = 0, cg = 0, cb = 0, pad0 = 0; // accumulated colour
+    float nr = 0, ng = 0, nb = 0, pad1 = 0; // accumulated direction contribution
+    std::uint32_t hit = 0; // non-zero when at least one contribution was accumulated
+    std::uint32_t reserved0 = 0;
+    std::uint32_t reserved1 = 0;
+    std::uint32_t reserved2 = 0;
+};
+
+struct direct_phase_sample_t { // one lightmap sample; mirrors GpuDirectPhaseSample in direct_phase.comp
+    float px = 0, py = 0, pz = 0, occlusion = 1; // position; occlusion is copied from the lightsurf sample
+    float nx = 0, ny = 0, nz = 1, twosided = 0; // normal; twosided > 0.5 = two-sided, < -0.5 = shader skips the sample
+};
+
+struct direct_phase_source_t { // one light; mirrors GpuDirectPhaseSource in direct_phase.comp
+    float px = 0, py = 0, pz = 0, light = 0;
+    float dx = 0, dy = 0, dz = 1, dist = 65536.0f;
+    float cr = 1, cg = 1, cb = 1, atten = 1;
+    std::uint32_t type = 0;      // 0 = point, 1 = sun
+    std::uint32_t formula = 0;   // light_formula_t for point lights
+    std::uint32_t flags = 0;     // bit 0: dirt
+    std::uint32_t reserved0 = 0;
+    float anglescale = 1;
+    float dirt = 0; // NOTE(review): direct_phase.comp only tests flags bit 0, not this value
+    float falloff = 0;
+    float pad0 = 0;
+};
+
+using direct_phase_accum_t = direct_accum_t; // the sample-driven path reuses the accumulator record
+
+
+enum class backend_state_t {
+    unavailable,
+    initialized,
+    failed
+};
+
+struct stats_t {
+    std::uint64_t batches = 0;
+    std::uint64_t rays = 0;
+    std::uint64_t gpu_batches = 0;
+    std::uint64_t fallback_batches = 0;
+};
+
+bool requested();
+backend_state_t state();
+const char *state_string();
+const char *last_error();
+stats_t stats();
+
+bool init(const mbsp_t *bsp);
+void shutdown();
+
+// Returns true when the batch was handled by the GPU backend. Returns false to
+// tell the caller to run the existing CPU/Embree path.
+bool trace_occlusion_batch(
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const ray_t *rays,
+    occlusion_result_t *results,
+    std::size_t count);
+
+// Sample-driven direct lighting: `accum` must hold sample_count records, one per sample.
+bool trace_direct_phase_batch(
+    const direct_phase_source_t *sources,
+    std::size_t source_count,
+    const direct_phase_sample_t *samples,
+    direct_phase_accum_t *accum,
+    std::size_t sample_count);
+// Job-driven variant: ranges[i] selects the jobs of sample i; `accum` holds sample_count records.
+bool trace_direct_accumulate_batch(
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const direct_job_t *jobs,
+    std::size_t job_count,
+    const direct_sample_range_t *ranges,
+    direct_accum_t *accum,
+    std::size_t sample_count);
+
+} // namespace gpu_light
+
+// C-style wrappers are easier to call from older code paths.
+bool GPU_TraceInit(const mbsp_t *bsp); // false on failure; GPU_TraceLastError() describes why
+void GPU_TraceShutdown();
+bool GPU_TraceAvailable(); // callers test this before choosing a GPU path
+const char *GPU_TraceLastError();
+
+// Implemented in light/ltface.cc by the v5 overlay; flushes pending sample-driven direct-light work.
+void GPU_DirectQueue_Flush(const mbsp_t *bsp);
diff --git a/light/CMakeLists.txt b/light/CMakeLists.txt
index 030bde65..73ffcda6 100644
--- a/light/CMakeLists.txt
+++ b/light/CMakeLists.txt
@@ -1,3 +1,4 @@
+option(LIGHT_ENABLE_VULKAN_GPU "Enable Vulkan GPU ray-query backend for light" OFF)
 option(SKIP_TBB_INSTALL "Skip TBB Library Installation" OFF)
 option(SKIP_EMBREE_INSTALL "Skip Embree Library Installation" OFF)
 
@@ -9,7 +10,7 @@ set(LIGHT_INCLUDES
 	../include/light/bounce.hh
 	../include/light/surflight.hh
 	../include/light/ltface.hh
-	../include/light/trace.hh
+	../include/light/trace.hh ../include/light/trace_gpu.hh
 	../include/light/write.hh
 	../include/light/spatialindex.hh
 )
@@ -47,9 +48,40 @@ endif(embree_FOUND)
 
 add_library(liblight STATIC ${LIGHT_SOURCES})
 
+if (LIGHT_ENABLE_VULKAN_GPU)
+    find_package(Vulkan REQUIRED)
+    find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED)
+
+    target_sources(liblight PRIVATE
+        trace_gpu.cc
+        trace_gpu_vulkan.cc
+    )
+    target_compile_definitions(liblight PUBLIC HAVE_GPU_LIGHT=1) # PUBLIC: include/light/trace_embree.hh branches on it inline, so every consumer must see the same value
+    target_link_libraries(liblight PRIVATE Vulkan::Vulkan)
+
+    set(GPU_SHADER_SPVS) # collects the compiled .spv outputs for the custom target below
+    foreach(GPU_SHADER_NAME occlusion direct_phase)
+        set(GPU_SHADER_SRC "${CMAKE_CURRENT_SOURCE_DIR}/gpu_shaders/${GPU_SHADER_NAME}.comp")
+        set(GPU_SHADER_SPV "${CMAKE_CURRENT_BINARY_DIR}/gpu_shaders/${GPU_SHADER_NAME}.comp.spv")
+        add_custom_command(
+            OUTPUT "${GPU_SHADER_SPV}"
+            COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/gpu_shaders"
+            COMMAND "${GLSLANG_VALIDATOR}" -V "${GPU_SHADER_SRC}" -o "${GPU_SHADER_SPV}"
+            DEPENDS "${GPU_SHADER_SRC}"
+            VERBATIM)
+        list(APPEND GPU_SHADER_SPVS "${GPU_SHADER_SPV}")
+    endforeach()
+    add_custom_target(light_gpu_shaders DEPENDS ${GPU_SHADER_SPVS})
+    add_dependencies(liblight light_gpu_shaders) # rebuild shaders whenever the library is built
+endif()
+
 target_link_libraries(liblight PRIVATE common ${CMAKE_THREAD_LIBS_INIT} fmt::fmt jsoncpp_static)
 
 add_executable(light main.cc)
+if (LIGHT_ENABLE_VULKAN_GPU)
+    add_dependencies(light light_gpu_shaders)
+endif()
+
 target_link_libraries(light PRIVATE common liblight)
 
 if (embree_FOUND)
diff --git a/light/gpu_shaders/direct_phase.comp b/light/gpu_shaders/direct_phase.comp
new file mode 100644
index 00000000..663ba73f
--- /dev/null
+++ b/light/gpu_shaders/direct_phase.comp
@@ -0,0 +1,180 @@
+#version 460
+#extension GL_EXT_ray_query : require
+#extension GL_EXT_scalar_block_layout : require
+
+layout(local_size_x = 64) in;
+
+struct GpuDirectPhaseSample { // field order mirrors gpu_light::direct_phase_sample_t
+    vec3 position;
+    float occlusion; // scaled into the light value in main() when the source has its dirt bit set
+    vec3 normal;
+    float twosided; // > 0.5: two-sided surface; < -0.5: sentinel, main() writes zeros and returns
+};
+
+struct GpuDirectPhaseSource { // field order mirrors gpu_light::direct_phase_source_t
+    vec3 position;
+    float light;
+    vec3 direction;
+    float dist; // ray length for suns; point lights trace the sample-to-light distance instead
+    vec3 color;
+    float atten;
+    uint type;      // 0 = point, 1 = sun
+    uint formula;   // light_formula_t for point lights
+    uint flags;     // bit 0 = dirt
+    uint reserved0;
+    float anglescale;
+    float dirt; // not read by this shader; only flags bit 0 is tested
+    float falloff;
+    float pad0;
+};
+
+struct GpuDirectAccum { // field order mirrors gpu_light::direct_accum_t
+    vec3 color;
+    float pad0;
+    vec3 normal;
+    float pad1;
+    uint hit; // 1 when at least one source reached the sample
+    uint reserved0;
+    uint reserved1;
+    uint reserved2;
+};
+
+layout(set = 0, binding = 0) uniform accelerationStructureEXT sceneAS;
+layout(std430, set = 0, binding = 1) readonly buffer Samples { GpuDirectPhaseSample samples[]; } sampleBuffer;
+layout(std430, set = 0, binding = 2) readonly buffer Sources { GpuDirectPhaseSource sources[]; } sourceBuffer;
+layout(std430, set = 0, binding = 3) writeonly buffer Accum { GpuDirectAccum accum[]; } accumBuffer;
+
+layout(push_constant) uniform PushConstants {
+    uint sampleCount; // invocations at or beyond this index return immediately
+    uint sourceCount; // every sample loops over this many sources
+    uint flags; // not read by this shader
+    uint reserved0;
+} pc;
+
+// Shadow test: true when any geometry in sceneAS lies along `dir` from `origin`
+// between t = 0.01 and t = max(tmax, 0.02).
+bool occluded(vec3 origin, vec3 dir, float tmax) {
+    const uint ray_flags = gl_RayFlagsTerminateOnFirstHitEXT | gl_RayFlagsOpaqueEXT;
+    float t_near = 0.01;
+    float t_far = max(tmax, 0.02);
+
+    rayQueryEXT query;
+    rayQueryInitializeEXT(query, sceneAS, ray_flags, 0xff, origin, t_near, dir, t_far);
+
+    // No candidate handling is needed; the loop only drives traversal to completion.
+    while (rayQueryProceedEXT(query)) {
+    }
+    uint committed = rayQueryGetIntersectionTypeEXT(query, true);
+    return committed != gl_RayQueryCommittedIntersectionNoneEXT;
+}
+
+// Brightness of a point light `dist` units away, before angle, dirt and shadowing.
+// Mirrors the broad ericw-tools delay/formula families well enough for the
+// experimental GPU fast path. Exact exotic cases should stay on CPU.
+float point_light_value(uint formula, float light, float atten, float dist, float falloff) {
+    float d = max(dist, 1.0);
+    float a = max(atten, 0.0001);
+    float scaled = d * a; // "wait" scales the fade distance, so it is applied before squaring
+    if (formula == 1u) {              // LF_INVERSE
+        return light * 128.0 / scaled;
+    } else if (formula == 2u) {       // LF_INVERSE2
+        return light * 128.0 * 128.0 / (scaled * scaled);
+    } else if (formula == 3u) {       // LF_INFINITE
+        return light;
+    } else if (formula == 5u) {       // LF_INVERSE2A
+        float shifted = scaled + 128.0;
+        return light * 128.0 * 128.0 / (shifted * shifted);
+    } else if (formula == 6u) {       // LF_QRAD3-ish
+        float qd = max(d, 16.0);
+        return light * 128.0 * 128.0 / (qd * qd * a);
+    }
+    // LF_LINEAR. If _falloff is set, use it as the zero point. Otherwise the
+    // classic formula is light - distance * attenuation.
+    if (falloff > 0.0) {
+        return light * max(0.0, 1.0 - d / falloff);
+    }
+    return light - scaled;
+}
+
+// One invocation per sample: sums every source's unshadowed contribution and
+// writes a single accumulator record for that sample.
+void main() {
+    uint sample_id = gl_GlobalInvocationID.x;
+    if (sample_id >= pc.sampleCount) {
+        return;
+    }
+
+    GpuDirectPhaseSample s = sampleBuffer.samples[sample_id];
+    if (s.twosided < -0.5) { // sentinel written by the host for samples that receive no light
+        accumBuffer.accum[sample_id].color = vec3(0.0);
+        accumBuffer.accum[sample_id].pad0 = 0.0;
+        accumBuffer.accum[sample_id].normal = vec3(0.0);
+        accumBuffer.accum[sample_id].pad1 = 0.0;
+        accumBuffer.accum[sample_id].hit = 0u;
+        accumBuffer.accum[sample_id].reserved0 = 0u;
+        accumBuffer.accum[sample_id].reserved1 = 0u;
+        accumBuffer.accum[sample_id].reserved2 = 0u;
+        return;
+    }
+    vec3 total_color = vec3(0.0);
+    vec3 total_normal = vec3(0.0);
+    uint any_hit = 0u;
+
+    for (uint source_id = 0u; source_id < pc.sourceCount; ++source_id) {
+        GpuDirectPhaseSource l = sourceBuffer.sources[source_id];
+
+        vec3 ray_dir;
+        float ray_dist;
+        float value;
+        vec3 ncontrib_dir;
+
+        if (l.type == 1u) {
+            ray_dir = normalize(l.direction);
+            ray_dist = l.dist;
+            float angle = dot(ray_dir, s.normal);
+            if (s.twosided > 0.5 && angle < 0.0) angle = -angle;
+            angle = max(0.0, angle);
+            angle = (1.0 - l.anglescale) + l.anglescale * angle;
+            value = l.light * angle;
+            ncontrib_dir = ray_dir;
+        } else {
+            vec3 to_light = l.position - s.position;
+            ray_dist = length(to_light);
+            if (ray_dist <= 0.01) continue; // sample coincides with the light; no usable direction
+            ray_dir = to_light / ray_dist;
+            float angle = dot(ray_dir, s.normal);
+            if (s.twosided > 0.5 && angle < 0.0) angle = -angle;
+            if (angle <= 0.0) {
+                continue;
+            }
+            angle = (1.0 - l.anglescale) + l.anglescale * max(0.0, angle);
+            value = point_light_value(l.formula, l.light, l.atten, ray_dist, l.falloff) * angle;
+            ncontrib_dir = ray_dir;
+        }
+
+        if (value <= 0.0) {
+            continue;
+        }
+
+        // s.occlusion is the amount of ambient occlusion (0 = open, 1 = fully
+        // occluded), so the light that survives dirtmapping is its complement.
+        float dirt_scale = ((l.flags & 1u) != 0u) ? 1.0 - clamp(s.occlusion, 0.0, 1.0) : 1.0;
+        value *= dirt_scale;
+        if (value <= 0.0) continue;
+
+        if (!occluded(s.position, ray_dir, ray_dist)) {
+            total_color += l.color * (value / 255.0);
+            total_normal += ncontrib_dir * value;
+            any_hit = 1u;
+        }
+    }
+
+    accumBuffer.accum[sample_id].color = total_color;
+    accumBuffer.accum[sample_id].pad0 = 0.0;
+    accumBuffer.accum[sample_id].normal = total_normal;
+    accumBuffer.accum[sample_id].pad1 = 0.0;
+    accumBuffer.accum[sample_id].hit = any_hit;
+    accumBuffer.accum[sample_id].reserved0 = 0u;
+    accumBuffer.accum[sample_id].reserved1 = 0u;
+    accumBuffer.accum[sample_id].reserved2 = 0u;
+}
diff --git a/light/gpu_shaders/occlusion.comp b/light/gpu_shaders/occlusion.comp
new file mode 100644
index 00000000..d60d4fe8
--- /dev/null
+++ b/light/gpu_shaders/occlusion.comp
@@ -0,0 +1,59 @@
+#version 460
+#extension GL_EXT_ray_query : require
+#extension GL_EXT_scalar_block_layout : require
+
+layout(local_size_x = 128) in;
+
+struct GpuRay { // field order mirrors gpu_light::ray_t
+    float ox; float oy; float oz; float tmin;
+    float dx; float dy; float dz; float tmax;
+    uint shadowMask; // not read by main()
+    uint userIndex; // not read by main()
+};
+
+struct GpuOcclusionResult { // field order mirrors gpu_light::occlusion_result_t
+    uint occluded; // 1 when the ray query committed a hit, otherwise 0
+    uint reserved0;
+    float tr; // main() always writes 1.0 to tr/tg/tb
+    float tg;
+    float tb;
+};
+
+layout(push_constant) uniform PushConstants {
+    uint rayCount; // invocations at or beyond this index return immediately
+    uint flags; // not read by this shader
+} pc;
+
+layout(set = 0, binding = 0) uniform accelerationStructureEXT sceneAS;
+layout(scalar, set = 0, binding = 1) readonly buffer RayBuffer { GpuRay rays[]; } rayBuffer;
+layout(scalar, set = 0, binding = 2) writeonly buffer ResultBuffer { GpuOcclusionResult results[]; } resultBuffer;
+
+// One invocation per ray: traces rays[i] against sceneAS and records hit or miss.
+void main() {
+    uint ray_id = gl_GlobalInvocationID.x;
+    if (ray_id >= pc.rayCount) {
+        return; // invocations past the end of the batch do nothing
+    }
+
+    GpuRay ray = rayBuffer.rays[ray_id];
+    vec3 ray_origin = vec3(ray.ox, ray.oy, ray.oz);
+    vec3 ray_dir = normalize(vec3(ray.dx, ray.dy, ray.dz));
+    const uint ray_flags = gl_RayFlagsTerminateOnFirstHitEXT | gl_RayFlagsOpaqueEXT;
+
+    rayQueryEXT query;
+    rayQueryInitializeEXT(query, sceneAS, ray_flags, 0xff, ray_origin, ray.tmin, ray_dir, ray.tmax);
+
+    // No candidate handling is needed; the loop only drives traversal to completion.
+    while (rayQueryProceedEXT(query)) {
+    }
+
+    uint committed = rayQueryGetIntersectionTypeEXT(query, true);
+    uint blocked = (committed != gl_RayQueryCommittedIntersectionNoneEXT) ? 1u : 0u;
+
+    // Transmittance is not computed here; every ray reports 1.0 on all channels.
+    resultBuffer.results[ray_id].occluded = blocked;
+    resultBuffer.results[ray_id].reserved0 = 0u;
+    resultBuffer.results[ray_id].tr = 1.0;
+    resultBuffer.results[ray_id].tg = 1.0;
+    resultBuffer.results[ray_id].tb = 1.0;
+}
diff --git a/light/light.cc b/light/light.cc
index a065f820..55ff4709 100644
--- a/light/light.cc
+++ b/light/light.cc
@@ -31,6 +31,7 @@
 #include <light/ltface.hh>
 #include <light/write.hh> // for facesup_t
 #include <light/trace_embree.hh>
+#include <light/trace_gpu.hh>
 
 #include <common/log.hh>
 #include <common/bsputils.hh>
@@ -291,7 +292,7 @@ light_settings::light_settings()
       write_normals{this, "wrnormals", false, &output_group, "output normals, tangents and bitangents in a BSPX lump"},
       novanilla{this, "novanilla", false, &experimental_group, "implies -bspxlit; don't write vanilla lighting"},
       gate{this, "gate", LIGHT_EQUAL_EPSILON, &performance_group, "cutoff lights at this brightness level"},
-      sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"},
+      sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"}, gpu{this, "gpu", false, &performance_group, "use Vulkan GPU ray-query backend for batched visibility rays"},
       arghradcompat{this, "arghradcompat", false, &output_group, "enable compatibility for Arghrad-specific keys"},
       nolighting{this, "nolighting", false, &output_group, "don't output main world lighting (Q2RTX)"},
       debugface{this, "debugface", std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
@@ -1339,6 +1340,16 @@ int light_main(int argc, const char **argv)
     FindDebugVert(&bsp);
 
     Embree_TraceInit(&bsp);
+#if defined(HAVE_GPU_LIGHT)
+    if (light_options.gpu.value()) {
+        if (!GPU_TraceInit(&bsp)) {
+            logging::print("WARNING: -gpu requested, but GPU trace init failed: {}\n", GPU_TraceLastError());
+        } else {
+            logging::print("GPU light tracing enabled.\n");
+        }
+    }
+#endif
+
 
     if (light_options.debugmode == debugmodes::phong_obj) {
         CalculateVertexNormals(&bsp);
@@ -1409,7 +1420,14 @@ int light_main(int argc, const char **argv)
     logging::print("{} empty lightmaps\n", static_cast<int>(fully_transparent_lightmaps));
     logging::close();
 
-    return 0;
+    
+#if defined(HAVE_GPU_LIGHT)
+    if (light_options.gpu.value()) {
+        GPU_DirectQueue_Flush(&bsp);
+        GPU_TraceShutdown();
+    }
+#endif
+return 0;
 }
 
 int light_main(const std::vector<std::string> &args)
diff --git a/light/ltface.cc b/light/ltface.cc
index 4634e54e..fc7c0197 100644
--- a/light/ltface.cc
+++ b/light/ltface.cc
@@ -18,6 +18,10 @@
 */
 
 #include <light/ltface.hh>
+#include <chrono>
+#include <cstdint>
+#include <mutex>
+#include <light/trace_gpu.hh>
 
 #include <light/light.hh>
 #include <light/trace_embree.hh>
@@ -2556,6 +2560,400 @@ lightsurf_t CreateLightmapSurface(const mbsp_t *bsp, const mface_t *face, const
     return Lightsurf_Init(modelinfo, cfg, face, bsp, facesup, facesup_decoupled);
 }
 
+
+
+#if defined(HAVE_GPU_LIGHT)
+static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps)
+{
+    // Per-face GPU direct lighting: builds one visibility job per (sample, light)
+    // pair for this face and accumulates the unshadowed ones into style 0.
+    //
+    // Disabled since v4/v5: dispatching face by face was slower than Embree, and
+    // it is not the whole-phase batching design (see GPU_DirectQueue_AddFace).
+    //
+    // NOTE(review): everything below this return is unreachable and no caller
+    // is visible in this patch; it is kept as a reference implementation only.
+    return false;
+
+    if (!GPU_TraceAvailable()) {
+        return false;
+    }
+
+    constexpr std::size_t GPU_DIRECT_MIN_JOBS = 32768;
+
+    const settings::worldspawn_keys &cfg = *lightsurf->cfg;
+    const modelinfo_t *modelinfo = lightsurf->modelinfo;
+    const qplane3f &plane = lightsurf->plane;
+    const std::size_t sample_count = lightsurf->samples.size();
+    if (!sample_count) {
+        return true;
+    }
+
+    std::vector<std::vector<gpu_light::direct_job_t>> per_sample(sample_count);
+
+    auto add_job = [&](int sample_index, const qvec3f &origin, const qvec3f &direction, float dist, const qvec3f &color, const qvec3f &normalcontrib) {
+        gpu_light::direct_job_t job{};
+        job.ox = origin[0];
+        job.oy = origin[1];
+        job.oz = origin[2];
+        job.tmin = 0.01f;
+        job.dx = direction[0];
+        job.dy = direction[1];
+        job.dz = direction[2];
+        job.tmax = dist;
+        job.cr = color[0];
+        job.cg = color[1];
+        job.cb = color[2];
+        job.nr = normalcontrib[0];
+        job.ng = normalcontrib[1];
+        job.nb = normalcontrib[2];
+        job.sample_index = static_cast<std::uint32_t>(sample_index);
+        per_sample[static_cast<std::size_t>(sample_index)].push_back(job);
+    };
+
+    // Entity lights.  This fast path is deliberately style-0/default-channel only.
+    for (const auto &entity_ptr : GetLights()) {
+        const light_t *entity = entity_ptr.get();
+        if (entity->getFormula() == LF_LOCALMIN) continue;
+        if (entity->nostaticlight.value()) continue;
+        if (entity->light.value() <= 0) continue;
+
+        if (entity->style.value() != 0) return false;
+        if (entity->shadow_channel_mask.value() != CHANNEL_MASK_DEFAULT) return false;
+        if (entity->light_channel_mask.value() != CHANNEL_MASK_DEFAULT) return false;
+
+        if (light_options.visapprox.value() == visapprox_t::VIS &&
+            entity->light_channel_mask.value() == CHANNEL_MASK_DEFAULT &&
+            entity->shadow_channel_mask.value() == CHANNEL_MASK_DEFAULT &&
+            VisCullEntity(bsp, lightsurf->pvs, entity->leaf)) {
+            continue;
+        }
+
+        const float planedist = plane.distance_to(entity->origin.value());
+        if (planedist < 0 && !entity->bleed.value() && !lightsurf->curved && !lightsurf->twosided) {
+            continue;
+        }
+        if (CullLight(entity, lightsurf)) {
+            continue;
+        }
+        if (!(entity->light_channel_mask.value() & lightsurf->object_channel_mask)) {
+            continue;
+        }
+
+        for (int i = 0; i < static_cast<int>(lightsurf->samples.size()); i++) {
+            const auto &sample = lightsurf->samples[i];
+            if (sample.occluded) continue;
+
+            const qvec3f &surfpoint = sample.point;
+            const qvec3f &surfnorm = sample.normal;
+            qvec3f surfpointToLightDir;
+            float surfpointToLightDist;
+            qvec3f color;
+            qvec3f normalcontrib;
+            GetLightContrib(cfg, entity, surfnorm, true, surfpoint, lightsurf->twosided, color, surfpointToLightDir, normalcontrib, &surfpointToLightDist);
+            const float occlusion = Dirt_GetScaleFactor(cfg, sample.occlusion, entity, surfpointToLightDist, lightsurf);
+            color *= occlusion;
+            if (fabs(LightSample_Brightness(color)) <= light_options.gate.value()) {
+                continue;
+            }
+            add_job(i, surfpoint, surfpointToLightDir, surfpointToLightDist, color, normalcontrib);
+        }
+    }
+
+    // Sunlight.  The GPU AS contains opaque solids only, so a miss is treated as visible sky.
+    // Sun texture filtering and non-zero styles stay on the CPU path.
+    for (const sun_t &sun : GetSuns()) {
+        if (sun.sunlight <= 0) continue;
+        if (sun.style != 0) return false;
+        if (sun.suntexture_value) return false;
+
+        qvec3f incoming = qv::normalize(sun.sunvec);
+        const float dp = qv::dot(incoming, plane.normal);
+        if (dp < -LIGHT_ANGLE_EPSILON && !lightsurf->curved && !lightsurf->twosided) {
+            continue;
+        }
+        if (!(lightsurf->object_channel_mask & CHANNEL_MASK_DEFAULT)) {
+            continue;
+        }
+
+        for (int i = 0; i < static_cast<int>(lightsurf->samples.size()); i++) {
+            const auto &sample = lightsurf->samples[i];
+            if (sample.occluded) continue;
+
+            const qvec3f &surfpoint = sample.point;
+            const qvec3f &surfnorm = sample.normal;
+            float angle = qv::dot(incoming, surfnorm);
+            if (lightsurf->twosided && angle < 0) {
+                angle = -angle;
+            }
+            angle = std::max(0.0f, angle);
+            angle = (1.0f - sun.anglescale) + sun.anglescale * angle;
+            float value = angle * sun.sunlight;
+            if (sun.dirt) {
+                value *= Dirt_GetScaleFactor(cfg, sample.occlusion, nullptr, 0.0f, lightsurf);
+            }
+            qvec3f color = sun.sunlight_color * (value / 255.0f);
+            if (fabs(LightSample_Brightness(color)) <= light_options.gate.value()) {
+                continue;
+            }
+            qvec3f normalcontrib = incoming * value;
+            add_job(i, surfpoint, incoming, MAX_SKY_DIST, color, normalcontrib);
+        }
+    }
+
+    std::size_t job_count = 0;
+    for (const auto &v : per_sample) {
+        job_count += v.size();
+    }
+    if (job_count == 0) {
+        return true;
+    }
+    if (job_count < GPU_DIRECT_MIN_JOBS) {
+        return false;
+    }
+
+    std::vector<gpu_light::direct_job_t> jobs;
+    std::vector<gpu_light::direct_sample_range_t> ranges(sample_count);
+    jobs.reserve(job_count);
+    for (std::size_t i = 0; i < sample_count; ++i) {
+        ranges[i].first = static_cast<std::uint32_t>(jobs.size());
+        ranges[i].count = static_cast<std::uint32_t>(per_sample[i].size());
+        jobs.insert(jobs.end(), per_sample[i].begin(), per_sample[i].end());
+    }
+
+    std::vector<gpu_light::direct_accum_t> accum(sample_count);
+    if (!gpu_light::trace_direct_accumulate_batch(
+            modelinfo,
+            CHANNEL_MASK_DEFAULT,
+            jobs.data(),
+            jobs.size(),
+            ranges.data(),
+            accum.data(),
+            sample_count)) {
+        return false;
+    }
+
+    lightmap_t *lightmap = Lightmap_ForStyle(lightmaps, 0, lightsurf);
+    bool hit = false;
+    for (std::size_t i = 0; i < sample_count; ++i) {
+        if (!accum[i].hit) continue;
+        const qvec3f color{accum[i].cr, accum[i].cg, accum[i].cb};
+        const qvec3f normalcontrib{accum[i].nr, accum[i].ng, accum[i].nb};
+        lightsample_t &sample = lightmap->samples[i];
+        sample.color += color;
+        sample.direction += normalcontrib;
+        lightmap->bounce_color += color;
+        hit = true;
+    }
+    if (hit) {
+        Lightmap_Save(bsp, lightmaps, lightsurf, lightmap, 0);
+    }
+    return true;
+}
+#endif
+
+
+
+
+
+
+#if defined(HAVE_GPU_LIGHT)
+namespace {
+struct gpu_direct_face_record_t { // one queued face and its slice of g_gpu_direct_samples
+    lightsurf_t *lightsurf = nullptr; // non-owning; dereferenced again when the queue is flushed
+    lightmapdict_t *lightmaps = nullptr; // non-owning; dereferenced again when the queue is flushed
+    std::size_t first_sample = 0; // index of the face's first entry in g_gpu_direct_samples
+    std::size_t sample_count = 0; // number of consecutive entries that belong to the face
+};
+
+std::mutex g_gpu_direct_queue_mutex; // held by the *Locked helpers' callers while touching the state below
+std::vector<gpu_light::direct_phase_sample_t> g_gpu_direct_samples; // samples waiting for the next dispatch
+std::vector<gpu_light::direct_phase_source_t> g_gpu_direct_sources; // built once, reused by every dispatch
+std::vector<gpu_direct_face_record_t> g_gpu_direct_faces; // faces owning the queued samples
+bool g_gpu_direct_sources_built = false; // set on the first build attempt, successful or not
+bool g_gpu_direct_disabled = false; // latched by an unsupported light or a failed dispatch
+
+static constexpr std::size_t GPU_DIRECT_FLUSH_SAMPLES = 1024ull * 1024ull; // queue is dispatched once it holds this many samples
+
+// Builds the shared GPU source list (entity lights, then suns) on first use.
+// Caller must hold g_gpu_direct_queue_mutex. Returns false, and latches
+// g_gpu_direct_disabled, when a light needs a feature the shader lacks.
+static bool GPU_DirectQueue_BuildSourcesLocked()
+{
+    if (g_gpu_direct_sources_built) {
+        return !g_gpu_direct_disabled;
+    }
+    g_gpu_direct_sources_built = true;
+    g_gpu_direct_sources.clear();
+
+    for (const auto &entity_ptr : GetLights()) {
+        const light_t *entity = entity_ptr.get();
+        if (entity->getFormula() == LF_LOCALMIN) continue; // the CPU direct loop skips these as well
+        if (entity->nostaticlight.value()) continue;
+        if (entity->light.value() <= 0) continue;
+        if (entity->sun.value()) continue;
+
+        if (entity->style.value() != 0 ||
+            entity->shadow_channel_mask.value() != CHANNEL_MASK_DEFAULT ||
+            entity->light_channel_mask.value() != CHANNEL_MASK_DEFAULT ||
+            entity->spotlight || entity->projectedmip) {
+            logging::print("GPU direct phase: unsupported entity light encountered; falling back to CPU direct path.\n");
+            g_gpu_direct_disabled = true;
+            return false;
+        }
+
+        gpu_light::direct_phase_source_t src{};
+        const qvec3f origin = entity->origin.value();
+        const qvec3f color = entity->color.value();
+        src.px = origin[0]; src.py = origin[1]; src.pz = origin[2];
+        src.light = entity->light.value();
+        src.dx = 0; src.dy = 0; src.dz = 1; src.dist = 0;
+        src.cr = color[0]; src.cg = color[1]; src.cb = color[2];
+        src.atten = entity->atten.value();
+        src.type = 0;
+        src.formula = static_cast<std::uint32_t>(entity->getFormula());
+        src.flags = entity->dirt.value() > 0 ? 1u : 0u; // _dirt: 1 enables, -1 disables; 0 (worldspawn default) is not consulted here
+        src.anglescale = entity->anglescale.value();
+        src.dirt = entity->dirt.value();
+        src.falloff = entity->falloff.value();
+        g_gpu_direct_sources.push_back(src);
+    }
+
+    for (const sun_t &sun : GetSuns()) {
+        if (sun.sunlight <= 0) continue;
+        if (sun.style != 0 || sun.suntexture_value) {
+            logging::print("GPU direct phase: unsupported sun style/texture encountered; falling back to CPU direct path.\n");
+            g_gpu_direct_disabled = true;
+            return false;
+        }
+        qvec3f incoming = qv::normalize(sun.sunvec);
+        gpu_light::direct_phase_source_t src{};
+        src.type = 1;
+        src.dx = incoming[0]; src.dy = incoming[1]; src.dz = incoming[2];
+        src.dist = MAX_SKY_DIST;
+        src.light = sun.sunlight;
+        src.cr = sun.sunlight_color[0]; src.cg = sun.sunlight_color[1]; src.cb = sun.sunlight_color[2];
+        src.atten = 1;
+        src.formula = 0;
+        src.flags = sun.dirt ? 1u : 0u;
+        src.anglescale = sun.anglescale;
+        src.dirt = sun.dirt ? 1.0f : 0.0f;
+        g_gpu_direct_sources.push_back(src);
+    }
+
+    logging::print("GPU direct phase: queued {} compatible direct sources.\n", g_gpu_direct_sources.size());
+    return true;
+}
+
+// Traces every queued sample against the shared source list and adds the
+// results to style 0 of each queued face. Caller holds g_gpu_direct_queue_mutex.
+static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp)
+{
+    if (g_gpu_direct_samples.empty()) {
+        g_gpu_direct_faces.clear();
+        return true;
+    }
+
+    const auto started = std::chrono::steady_clock::now();
+    std::vector<gpu_light::direct_phase_accum_t> results(g_gpu_direct_samples.size());
+    const bool dispatched = gpu_light::trace_direct_phase_batch(g_gpu_direct_sources.data(),
+        g_gpu_direct_sources.size(), g_gpu_direct_samples.data(), results.data(), g_gpu_direct_samples.size());
+    const auto finished = std::chrono::steady_clock::now();
+    const auto elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(finished - started).count();
+    const double gpu_ms = elapsed_us / 1000.0;
+
+    if (!dispatched) {
+        // Queued faces lose their GPU contribution; later faces use the CPU path.
+        g_gpu_direct_disabled = true;
+        logging::print("ERROR: GPU direct phase dispatch failed: {}\n", GPU_TraceLastError());
+        logging::print("ERROR: disabling GPU direct phase for the rest of this run. Re-run without -gpu for guaranteed CPU output.\n");
+        g_gpu_direct_samples.clear();
+        g_gpu_direct_faces.clear();
+        return false;
+    }
+
+    for (const gpu_direct_face_record_t &face : g_gpu_direct_faces) {
+        if (!(face.lightsurf && face.lightmaps && face.sample_count != 0)) {
+            continue;
+        }
+        lightmap_t *target = Lightmap_ForStyle(face.lightmaps, 0, face.lightsurf);
+        bool touched = false;
+        for (std::size_t local = 0; local < face.sample_count; ++local) {
+            const gpu_light::direct_phase_accum_t &got = results[face.first_sample + local];
+            if (!got.hit) continue;
+            const qvec3f rgb{got.cr, got.cg, got.cb};
+            const qvec3f bent{got.nr, got.ng, got.nb};
+            lightsample_t &texel = target->samples[local];
+            texel.color += rgb;
+            texel.direction += bent;
+            target->bounce_color += rgb;
+            touched = true;
+        }
+        if (touched) {
+            Lightmap_Save(bsp, face.lightmaps, face.lightsurf, target, 0);
+        }
+    }
+
+    const auto implicit_rays = static_cast<std::uint64_t>(g_gpu_direct_samples.size()) * static_cast<std::uint64_t>(g_gpu_direct_sources.size());
+    logging::print("GPU direct phase: flushed {} samples x {} sources = {} implicit rays in {:.3f} ms\n",
+        g_gpu_direct_samples.size(), g_gpu_direct_sources.size(), implicit_rays, gpu_ms);
+
+    g_gpu_direct_samples.clear();
+    g_gpu_direct_faces.clear();
+    return true;
+}
+} // namespace
+
+void GPU_DirectQueue_Flush(const mbsp_t *bsp)
+{
+    const std::lock_guard<std::mutex> guard(g_gpu_direct_queue_mutex); // the *Locked helper expects the mutex held
+    (void)GPU_DirectQueue_FlushLocked(bsp); // a failed dispatch is reported and latched inside the helper
+}
+
+// Queues one face for the batched GPU direct pass. Returns true when the GPU
+// path owns this face's entity/sun lighting, false when the CPU must do it.
+static bool GPU_DirectQueue_AddFace(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps)
+{
+    if (!GPU_TraceAvailable() || !lightsurf || !lightmaps) {
+        return false;
+    }
+
+    // Take the lock before looking at any queue state: g_gpu_direct_disabled is
+    // written under it, and the source list decides whether skipping is safe.
+    std::lock_guard<std::mutex> lock(g_gpu_direct_queue_mutex);
+    if (!GPU_DirectQueue_BuildSourcesLocked()) {
+        return false; // unsupported light present, or an earlier dispatch failed
+    }
+
+    const std::size_t sample_count = lightsurf->samples.size();
+    if (g_gpu_direct_sources.empty() || !sample_count ||
+        !(lightsurf->object_channel_mask & CHANNEL_MASK_DEFAULT)) {
+        return true; // nothing the (default-channel) GPU sources could add to this face
+    }
+
+    const std::size_t first_sample = g_gpu_direct_samples.size();
+    g_gpu_direct_faces.push_back(gpu_direct_face_record_t{lightsurf, lightmaps, first_sample, sample_count});
+
+    for (const auto &sample : lightsurf->samples) {
+        gpu_light::direct_phase_sample_t s{};
+        if (!sample.occluded) {
+            s.px = sample.point[0]; s.py = sample.point[1]; s.pz = sample.point[2];
+            s.nx = sample.normal[0]; s.ny = sample.normal[1]; s.nz = sample.normal[2];
+            s.occlusion = sample.occlusion;
+            s.twosided = lightsurf->twosided ? 1.0f : 0.0f;
+        } else {
+            s.twosided = -1.0f; // sentinel: shader skips occluded/invalid samples
+        }
+        g_gpu_direct_samples.push_back(s);
+    }
+
+    if (g_gpu_direct_samples.size() >= GPU_DIRECT_FLUSH_SAMPLES && !GPU_DirectQueue_FlushLocked(bsp)) {
+        return false; // the failed dispatch dropped this face's samples: let the CPU light it
+    }
+    return true;
+}
+#endif
+
 /*
  * ============
  * LightFace
@@ -2587,7 +2985,10 @@ void DirectLightFace(const mbsp_t *bsp, lightsurf_t &lightsurf, const settings::
 
         /* positive lights */
         if (!(modelinfo->lightignore.value() || extended_flags.light_ignore)) {
-            for (const auto &entity : GetLights()) {
+            #if defined(HAVE_GPU_LIGHT)
+            if (!GPU_DirectQueue_AddFace(bsp, &lightsurf, lightmaps)) {
+#endif
+for (const auto &entity : GetLights()) {
                 if (entity->getFormula() == LF_LOCALMIN)
                     continue;
                 if (entity->nostaticlight.value())
@@ -2598,6 +2999,9 @@ void DirectLightFace(const mbsp_t *bsp, lightsurf_t &lightsurf, const settings::
             for (const sun_t &sun : GetSuns())
                 if (sun.sunlight > 0)
                     LightFace_Sky(bsp, &sun, &lightsurf, lightmaps);
+#if defined(HAVE_GPU_LIGHT)
+            }
+#endif
 
             // mxd. Add surface lights...
             // FIXME: negative surface lights
diff --git a/light/trace_gpu.cc b/light/trace_gpu.cc
new file mode 100644
index 00000000..e20a33fd
--- /dev/null
+++ b/light/trace_gpu.cc
@@ -0,0 +1,233 @@
+#include <light/trace_gpu.hh>
+
+#include <atomic>
+#include <mutex>
+#include <string>
+
+#if defined(HAVE_GPU_LIGHT)
+namespace gpu_light::vulkan_backend { // Forward declarations of the backend entry points called by the wrappers in this file.
+bool init(const mbsp_t *bsp, std::string &error); // gpu_light::init() treats true as "initialized", false as "failed"
+void shutdown();
+bool trace_occlusion_batch( // the wrappers treat false as a fallback and copy a non-empty `error` into g_last_error
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const gpu_light::ray_t *rays,
+    gpu_light::occlusion_result_t *results,
+    std::size_t count,
+    std::string &error);
+
+bool trace_direct_phase_batch( // same return/error convention as trace_occlusion_batch
+    const gpu_light::direct_phase_source_t *sources,
+    std::size_t source_count,
+    const gpu_light::direct_phase_sample_t *samples,
+    gpu_light::direct_phase_accum_t *accum,
+    std::size_t sample_count,
+    std::string &error);
+
+bool trace_direct_accumulate_batch( // same return/error convention as trace_occlusion_batch
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const gpu_light::direct_job_t *jobs,
+    std::size_t job_count,
+    const gpu_light::direct_sample_range_t *ranges,
+    gpu_light::direct_accum_t *accum,
+    std::size_t sample_count,
+    std::string &error);
+} // namespace gpu_light::vulkan_backend
+#endif
+
+namespace gpu_light {
+namespace { // File-local backend bookkeeping.
+std::mutex g_mutex; // taken by every function in this file that touches the variables below
+backend_state_t g_state = backend_state_t::unavailable; // written by init() and shutdown()
+std::string g_last_error; // most recent error text recorded by init() or a trace_* wrapper
+stats_t g_stats; // batch/ray counters updated by the trace_* wrappers
+} // namespace
+
+bool requested() { // Reports whether the GPU backend was requested.
+    // The apply script wires this to light_options.gpu in the call site. Keeping
+    // this function independent avoids pulling all light settings into this TU.
+    return true; // NOTE(review): unconditional; no settings are consulted in this function
+}
+
+backend_state_t state() { // Copy of the backend state, read under the mutex.
+    const std::lock_guard<std::mutex> guard(g_mutex);
+    return g_state;
+}
+
+const char *state_string() { // Static, human-readable name of the current backend state.
+    const backend_state_t current = state(); // state() reads the value under the mutex
+    if (current == backend_state_t::unavailable) return "unavailable";
+    if (current == backend_state_t::initialized) return "initialized";
+    if (current == backend_state_t::failed) return "failed";
+    // Value outside the enumerators handled above.
+    return "unknown";
+}
+
+const char *last_error() { // Returns this thread's copy of the last error; valid until this thread calls last_error() again.
+    thread_local std::string snapshot; // a pointer into g_last_error would dangle once a trace_* wrapper reassigns it after unlock
+    const std::lock_guard<std::mutex> guard(g_mutex);
+    return (snapshot = g_last_error).c_str();
+}
+stats_t stats() { // Copy of the counters, taken under the mutex.
+    const std::lock_guard<std::mutex> guard(g_mutex);
+    return g_stats;
+}
+
+bool init(const mbsp_t *bsp) {
+    const std::lock_guard<std::mutex> guard(g_mutex);
+#if defined(HAVE_GPU_LIGHT)
+    // Start from an empty error string; it is handed to the backend as its error output.
+    g_last_error.clear();
+    const bool backend_ready = vulkan_backend::init(bsp, g_last_error);
+    // Record the outcome so state() and GPU_TraceAvailable() reflect it.
+    g_state = backend_ready ? backend_state_t::initialized : backend_state_t::failed;
+    return backend_ready;
+#else
+    // Compiled without the Vulkan backend: always unavailable.
+    (void)bsp;
+    g_last_error = "light was built without LIGHT_ENABLE_VULKAN_GPU=ON";
+    g_state = backend_state_t::unavailable;
+    return false;
+#endif
+}
+
+void shutdown() { // Shuts the Vulkan backend down (when compiled in) and marks the state unavailable.
+    const std::lock_guard<std::mutex> guard(g_mutex);
+#if defined(HAVE_GPU_LIGHT)
+    vulkan_backend::shutdown();
+#endif
+    g_state = backend_state_t::unavailable;
+}
+
+bool trace_occlusion_batch( // Forwards one occlusion batch to the Vulkan backend; a false return is counted as a fallback batch.
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const ray_t *rays,
+    occlusion_result_t *results,
+    std::size_t count) {
+    if (!rays || !results || count == 0) {
+        return true; // null or empty batch: reported as success and not counted in the stats
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(g_mutex); // scoped so the mutex is released before the backend call
+        g_stats.batches++;
+        g_stats.rays += count;
+        if (g_state != backend_state_t::initialized) {
+            g_stats.fallback_batches++;
+            return false;
+        }
+    }
+
+#if defined(HAVE_GPU_LIGHT)
+    std::string error;
+    const bool ok = vulkan_backend::trace_occlusion_batch(self, shadow_mask, rays, results, count, error);
+    std::lock_guard<std::mutex> lock(g_mutex); // re-acquired to record the outcome
+    if (ok) {
+        g_stats.gpu_batches++;
+        return true;
+    }
+    g_stats.fallback_batches++;
+    if (!error.empty()) {
+        g_last_error = error; // keep the previous message when the backend supplied none
+    }
+    return false;
+#else
+    (void)self;
+    (void)shadow_mask;
+    return false;
+#endif
+}
+
+
+bool trace_direct_phase_batch( // Forwards one direct-phase batch to the Vulkan backend; a false return is counted as a fallback batch.
+    const direct_phase_source_t *sources,
+    std::size_t source_count,
+    const direct_phase_sample_t *samples,
+    direct_phase_accum_t *accum,
+    std::size_t sample_count) {
+    if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) {
+        return true; // null or empty batch: reported as success and not counted in the stats
+    }
+
+    const std::uint64_t implicit_rays = static_cast<std::uint64_t>(source_count) * static_cast<std::uint64_t>(sample_count); // one ray per (source, sample) pair for the stats
+    {
+        std::lock_guard<std::mutex> lock(g_mutex); // scoped so the mutex is released before the backend call
+        g_stats.batches++;
+        g_stats.rays += implicit_rays;
+        if (g_state != backend_state_t::initialized) {
+            g_stats.fallback_batches++;
+            return false;
+        }
+    }
+
+#if defined(HAVE_GPU_LIGHT)
+    std::string error;
+    const bool ok = vulkan_backend::trace_direct_phase_batch(
+        sources, source_count, samples, accum, sample_count, error);
+    std::lock_guard<std::mutex> lock(g_mutex); // re-acquired to record the outcome
+    if (ok) {
+        g_stats.gpu_batches++;
+        return true;
+    }
+    g_stats.fallback_batches++;
+    if (!error.empty()) {
+        g_last_error = error; // keep the previous message when the backend supplied none
+    }
+    return false;
+#else
+    return false;
+#endif
+}
+
+
+bool trace_direct_accumulate_batch( // Forwards one direct-accumulate batch to the Vulkan backend; a false return is counted as a fallback batch.
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const direct_job_t *jobs,
+    std::size_t job_count,
+    const direct_sample_range_t *ranges,
+    direct_accum_t *accum,
+    std::size_t sample_count) {
+    if (!jobs || !ranges || !accum || job_count == 0 || sample_count == 0) {
+        return true; // null or empty batch: reported as success and not counted in the stats
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(g_mutex); // scoped so the mutex is released before the backend call
+        g_stats.batches++;
+        g_stats.rays += job_count; // the ray counter advances by the number of jobs
+        if (g_state != backend_state_t::initialized) {
+            g_stats.fallback_batches++;
+            return false;
+        }
+    }
+
+#if defined(HAVE_GPU_LIGHT)
+    std::string error;
+    const bool ok = vulkan_backend::trace_direct_accumulate_batch(
+        self, shadow_mask, jobs, job_count, ranges, accum, sample_count, error);
+    std::lock_guard<std::mutex> lock(g_mutex); // re-acquired to record the outcome
+    if (ok) {
+        g_stats.gpu_batches++;
+        return true;
+    }
+    g_stats.fallback_batches++;
+    if (!error.empty()) {
+        g_last_error = error; // keep the previous message when the backend supplied none
+    }
+    return false;
+#else
+    (void)self;
+    (void)shadow_mask;
+    return false;
+#endif
+}
+
+} // namespace gpu_light
+
+bool GPU_TraceInit(const mbsp_t *bsp) { return gpu_light::init(bsp); } // global-namespace forwarder to gpu_light::init
+void GPU_TraceShutdown() { gpu_light::shutdown(); } // forwarder to gpu_light::shutdown
+bool GPU_TraceAvailable() { return gpu_light::state() == gpu_light::backend_state_t::initialized; } // true only after a successful init; takes the backend mutex on every call
+const char *GPU_TraceLastError() { return gpu_light::last_error(); } // forwarder to gpu_light::last_error
diff --git a/light/trace_gpu_vulkan.cc b/light/trace_gpu_vulkan.cc
new file mode 100644
index 00000000..eda4d928
--- /dev/null
+++ b/light/trace_gpu_vulkan.cc
@@ -0,0 +1,1244 @@
+#include <light/trace_gpu.hh>
+
+#if defined(HAVE_GPU_LIGHT)
+
+#include <common/bspfile.hh>
+#include <common/log.hh>
+#include <light/light.hh>
+#include <light/trace_embree.hh>
+
+#include <vulkan/vulkan.h>
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#if defined(__linux__)
+#include <limits.h>
+#include <unistd.h>
+#endif
+
+namespace gpu_light::vulkan_backend {
+namespace {
+
+struct buffer_t { // A VkBuffer together with the device memory bound to it.
+    VkBuffer buffer = VK_NULL_HANDLE;
+    VkDeviceMemory memory = VK_NULL_HANDLE; // allocated and bound at offset 0 by create_buffer()
+    VkDeviceSize size = 0; // size requested from create_buffer(), not the allocation size
+};
+
+struct as_t { // An acceleration structure plus the buffer that stores it.
+    VkAccelerationStructureKHR as = VK_NULL_HANDLE;
+    buffer_t storage; // backing buffer created in create_acceleration_structure()
+    VkDeviceAddress address = 0; // result of vkGetAccelerationStructureDeviceAddressKHR
+};
+
+struct vertex_t { // One BLAS vertex; build_blas() declares it as VK_FORMAT_R32G32B32_SFLOAT with stride sizeof(vertex_t).
+    float x, y, z;
+};
+
+struct gpu_ray_host_t { // Host-side ray record; a static_assert in this file pins it to 40 bytes to match the shader.
+    float ox, oy, oz, tmin;
+    float dx, dy, dz, tmax;
+    std::uint32_t shadow_mask;
+    std::uint32_t user_index;
+};
+
+struct gpu_result_host_t { // Host-side occlusion result; pinned to 20 bytes by a static_assert in this file.
+    std::uint32_t occluded;
+    std::uint32_t reserved0;
+    float tr, tg, tb;
+};
+
+struct gpu_direct_job_host_t { // Host-side direct-light job; pinned to 80 bytes by a static_assert in this file.
+    float ox, oy, oz, tmin;
+    float dx, dy, dz, tmax;
+    float cr, cg, cb, pad0;
+    float nr, ng, nb, pad1;
+    std::uint32_t sample_index;
+    std::uint32_t flags;
+    std::uint32_t reserved0;
+    std::uint32_t reserved1;
+};
+
+struct gpu_direct_range_host_t { // Host-side (first, count) range; pinned to 8 bytes by a static_assert in this file.
+    std::uint32_t first;
+    std::uint32_t count;
+};
+
+struct gpu_direct_accum_host_t { // Host-side accumulation record; pinned to 48 bytes by a static_assert in this file.
+    float cr, cg, cb, pad0;
+    float nr, ng, nb, pad1;
+    std::uint32_t hit;
+    std::uint32_t reserved0;
+    std::uint32_t reserved1;
+    std::uint32_t reserved2;
+};
+
+
+
+struct gpu_direct_phase_sample_host_t { // Host-side direct-phase sample; pinned to 32 bytes by a static_assert in this file.
+    float px, py, pz, occlusion;
+    float nx, ny, nz, twosided;
+};
+
+struct gpu_direct_phase_source_host_t { // Host-side direct-phase light source; pinned to 80 bytes by a static_assert in this file.
+    float px, py, pz, light;
+    float dx, dy, dz, dist;
+    float cr, cg, cb, atten;
+    std::uint32_t type;
+    std::uint32_t formula;
+    std::uint32_t flags;
+    std::uint32_t reserved0;
+    float anglescale;
+    float dirt;
+    float falloff;
+    float pad0;
+};
+struct push_constants_t { // Two-word push-constant block; NOTE(review): presumably for the occlusion pipeline - no size static_assert covers it
+    std::uint32_t ray_count;
+    std::uint32_t flags;
+};
+
+struct direct_push_constants_t { // Four-word push-constant block; NOTE(review): presumably for the direct pipeline - no size static_assert covers it
+    std::uint32_t sample_count;
+    std::uint32_t source_count;
+    std::uint32_t flags;
+    std::uint32_t reserved0;
+};
+
+static_assert(sizeof(gpu_ray_host_t) == 40, "GPU ray layout must match shader");
+static_assert(sizeof(gpu_result_host_t) == 20, "GPU result layout must match shader");
+static_assert(sizeof(gpu_direct_job_host_t) == 80, "GPU direct job layout must match shader");
+static_assert(sizeof(gpu_direct_range_host_t) == 8, "GPU direct range layout must match shader");
+static_assert(sizeof(gpu_direct_accum_host_t) == 48, "GPU direct accum layout must match shader");
+static_assert(sizeof(gpu_direct_phase_sample_host_t) == 32, "GPU direct phase sample layout must match shader");
+static_assert(sizeof(gpu_direct_phase_source_host_t) == 80, "GPU direct phase source layout must match shader");
+
+struct context_t { // Every Vulkan object owned by this backend; destroy_locked() releases them and resets the struct.
+    VkInstance instance = VK_NULL_HANDLE;
+    VkPhysicalDevice physical = VK_NULL_HANDLE; // chosen by pick_device()
+    VkDevice device = VK_NULL_HANDLE;
+    VkQueue queue = VK_NULL_HANDLE; // queue 0 of queue_family
+    std::uint32_t queue_family = 0; // first compute-capable family of the chosen device
+
+    VkPhysicalDeviceMemoryProperties memory_props{}; // consulted by find_memory_type()
+
+    VkCommandPool command_pool = VK_NULL_HANDLE;
+    VkCommandBuffer command_buffer = VK_NULL_HANDLE; // single buffer reused by one_time_submit()
+
+    PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR_ = nullptr; // extension entry points loaded in create_device()
+    PFN_vkCreateAccelerationStructureKHR vkCreateAccelerationStructureKHR_ = nullptr;
+    PFN_vkDestroyAccelerationStructureKHR vkDestroyAccelerationStructureKHR_ = nullptr;
+    PFN_vkGetAccelerationStructureBuildSizesKHR vkGetAccelerationStructureBuildSizesKHR_ = nullptr;
+    PFN_vkCmdBuildAccelerationStructuresKHR vkCmdBuildAccelerationStructuresKHR_ = nullptr;
+    PFN_vkGetAccelerationStructureDeviceAddressKHR vkGetAccelerationStructureDeviceAddressKHR_ = nullptr;
+    PFN_vkCmdWriteAccelerationStructuresPropertiesKHR vkCmdWriteAccelerationStructuresPropertiesKHR_ = nullptr; // NOTE(review): create_device() never loads this one; it stays null
+
+    buffer_t vertices; // BLAS build input, filled in build_blas()
+    buffer_t indices; // BLAS build input, filled in build_blas()
+    buffer_t instances; // TLAS instance record, filled in build_tlas()
+    as_t blas;
+    as_t tlas;
+
+    VkDescriptorSetLayout descriptor_set_layout = VK_NULL_HANDLE;
+    VkPipelineLayout pipeline_layout = VK_NULL_HANDLE;
+    VkPipeline pipeline = VK_NULL_HANDLE;
+    VkDescriptorPool descriptor_pool = VK_NULL_HANDLE;
+    VkDescriptorSet descriptor_set = VK_NULL_HANDLE;
+
+    VkDescriptorSetLayout direct_descriptor_set_layout = VK_NULL_HANDLE; // second pipeline's objects, destroyed first in destroy_locked()
+    VkPipelineLayout direct_pipeline_layout = VK_NULL_HANDLE;
+    VkPipeline direct_pipeline = VK_NULL_HANDLE;
+    VkDescriptorPool direct_descriptor_pool = VK_NULL_HANDLE;
+    VkDescriptorSet direct_descriptor_set = VK_NULL_HANDLE;
+
+    std::size_t triangle_count = 0; // primitive count recorded by build_blas()
+    bool has_filtered_embree_geometry = false;
+};
+
+std::mutex g_mutex; // NOTE(review): presumably serializes access to g; no helper in this part of the file locks it itself
+context_t g; // the single backend context read and written by the helpers below
+
+static std::string vk_result_string(VkResult r) { // Symbolic name of r, or "VkResult(<number>)" for codes not listed here.
+    static const struct { VkResult code; const char *name; } known_results[] = {
+        {VK_SUCCESS, "VK_SUCCESS"},
+        {VK_NOT_READY, "VK_NOT_READY"},
+        {VK_TIMEOUT, "VK_TIMEOUT"},
+        {VK_EVENT_SET, "VK_EVENT_SET"},
+        {VK_EVENT_RESET, "VK_EVENT_RESET"},
+        {VK_INCOMPLETE, "VK_INCOMPLETE"},
+        {VK_ERROR_OUT_OF_HOST_MEMORY, "VK_ERROR_OUT_OF_HOST_MEMORY"},
+        {VK_ERROR_OUT_OF_DEVICE_MEMORY, "VK_ERROR_OUT_OF_DEVICE_MEMORY"},
+        {VK_ERROR_INITIALIZATION_FAILED, "VK_ERROR_INITIALIZATION_FAILED"},
+        {VK_ERROR_DEVICE_LOST, "VK_ERROR_DEVICE_LOST"},
+        {VK_ERROR_MEMORY_MAP_FAILED, "VK_ERROR_MEMORY_MAP_FAILED"},
+        {VK_ERROR_LAYER_NOT_PRESENT, "VK_ERROR_LAYER_NOT_PRESENT"},
+        {VK_ERROR_EXTENSION_NOT_PRESENT, "VK_ERROR_EXTENSION_NOT_PRESENT"},
+        {VK_ERROR_FEATURE_NOT_PRESENT, "VK_ERROR_FEATURE_NOT_PRESENT"},
+        {VK_ERROR_INCOMPATIBLE_DRIVER, "VK_ERROR_INCOMPATIBLE_DRIVER"}};
+    for (const auto &entry : known_results) if (entry.code == r) return entry.name;
+    return "VkResult(" + std::to_string(static_cast<int>(r)) + ")";
+}
+
+static bool check(VkResult r, const char *what, std::string &error) { // true on VK_SUCCESS; otherwise writes "<what> failed: <code>" to error.
+    const bool succeeded = (r == VK_SUCCESS);
+    if (!succeeded) error = std::string(what) + " failed: " + vk_result_string(r);
+    return succeeded;
+}
+
+static bool has_extension(const std::vector<VkExtensionProperties> &props, const char *name) { // true when props lists an extension called name
+    for (const VkExtensionProperties &candidate : props)
+        if (std::strcmp(candidate.extensionName, name) == 0) return true;
+    return false;
+}
+
+static void destroy_buffer(buffer_t &b) { // Releases whichever handles b holds, then resets it to the empty state.
+    if (b.buffer != VK_NULL_HANDLE) vkDestroyBuffer(g.device, b.buffer, nullptr);
+    if (b.memory != VK_NULL_HANDLE) vkFreeMemory(g.device, b.memory, nullptr);
+    b = buffer_t{};
+}
+
+static void destroy_as(as_t &a) { // Destroys the acceleration structure, then its storage buffer, then resets a.
+    if (a.as != VK_NULL_HANDLE) g.vkDestroyAccelerationStructureKHR_(g.device, a.as, nullptr);
+    destroy_buffer(a.storage);
+    a = as_t{};
+}
+
+static void destroy_locked() { // Destroys every object held in g and resets g; the name suggests the caller holds g_mutex (not visible here).
+    if (g.device) vkDeviceWaitIdle(g.device); // wait for outstanding GPU work before destroying anything
+
+    if (g.direct_pipeline) vkDestroyPipeline(g.device, g.direct_pipeline, nullptr);
+    if (g.direct_pipeline_layout) vkDestroyPipelineLayout(g.device, g.direct_pipeline_layout, nullptr);
+    if (g.direct_descriptor_pool) vkDestroyDescriptorPool(g.device, g.direct_descriptor_pool, nullptr);
+    if (g.direct_descriptor_set_layout) vkDestroyDescriptorSetLayout(g.device, g.direct_descriptor_set_layout, nullptr);
+
+    if (g.pipeline) vkDestroyPipeline(g.device, g.pipeline, nullptr);
+    if (g.pipeline_layout) vkDestroyPipelineLayout(g.device, g.pipeline_layout, nullptr);
+    if (g.descriptor_pool) vkDestroyDescriptorPool(g.device, g.descriptor_pool, nullptr);
+    if (g.descriptor_set_layout) vkDestroyDescriptorSetLayout(g.device, g.descriptor_set_layout, nullptr);
+
+    destroy_as(g.tlas); // top level first: its instance references the BLAS address
+    destroy_as(g.blas);
+    destroy_buffer(g.instances);
+    destroy_buffer(g.indices);
+    destroy_buffer(g.vertices);
+
+    if (g.command_pool) vkDestroyCommandPool(g.device, g.command_pool, nullptr); // g.command_buffer is not freed separately
+    if (g.device) vkDestroyDevice(g.device, nullptr);
+    if (g.instance) vkDestroyInstance(g.instance, nullptr);
+    g = {}; // back to all-null handles, so a repeated call is a no-op
+}
+
+static bool find_memory_type(std::uint32_t type_bits, VkMemoryPropertyFlags props, std::uint32_t &type_index) {
+    // Picks the lowest-indexed memory type that type_bits permits and that has every flag in props.
+    for (std::uint32_t idx = 0; idx < g.memory_props.memoryTypeCount; ++idx) {
+        const bool permitted = (type_bits & (1u << idx)) != 0;
+        const bool has_flags = (g.memory_props.memoryTypes[idx].propertyFlags & props) == props;
+        if (permitted && has_flags) { type_index = idx; return true; }
+    }
+    return false;
+}
+
+static bool create_buffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags props, buffer_t &out, // Creates a buffer, allocates and binds its memory, and optionally uploads initial_data.
+    std::string &error, const void *initial_data = nullptr) { // On failure: returns false, sets error, and leaves out destroyed/reset.
+    out = {};
+    out.size = size;
+
+    VkBufferCreateInfo bi{};
+    bi.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+    bi.size = size;
+    bi.usage = usage;
+    bi.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+    if (!check(vkCreateBuffer(g.device, &bi, nullptr, &out.buffer), "vkCreateBuffer", error)) return false;
+
+    VkMemoryRequirements req{};
+    vkGetBufferMemoryRequirements(g.device, out.buffer, &req);
+
+    std::uint32_t mem_type = 0;
+    if (!find_memory_type(req.memoryTypeBits, props, mem_type)) {
+        error = "no compatible Vulkan memory type for buffer";
+        destroy_buffer(out);
+        return false;
+    }
+
+    VkMemoryAllocateFlagsInfo flags{};
+    flags.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO;
+    flags.flags = (usage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT) ? VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT : 0; // device-address buffers need the matching allocation flag
+
+    VkMemoryAllocateInfo ai{};
+    ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+    ai.pNext = flags.flags ? &flags : nullptr; // chain the flags struct only when it carries a flag
+    ai.allocationSize = req.size;
+    ai.memoryTypeIndex = mem_type;
+
+    if (!check(vkAllocateMemory(g.device, &ai, nullptr, &out.memory), "vkAllocateMemory", error)) {
+        destroy_buffer(out);
+        return false;
+    }
+    if (!check(vkBindBufferMemory(g.device, out.buffer, out.memory, 0), "vkBindBufferMemory", error)) {
+        destroy_buffer(out);
+        return false;
+    }
+
+    if (initial_data) {
+        void *mapped = nullptr;
+        if (!check(vkMapMemory(g.device, out.memory, 0, size, 0, &mapped), "vkMapMemory", error)) { // NOTE(review): requires host-visible props; no flush follows, so callers presumably also pass HOST_COHERENT
+            destroy_buffer(out);
+            return false;
+        }
+        std::memcpy(mapped, initial_data, static_cast<std::size_t>(size)); // initial_data must provide at least size bytes
+        vkUnmapMemory(g.device, out.memory);
+    }
+
+    return true;
+}
+
+static VkDeviceAddress buffer_address(const buffer_t &b) { // Device address of b.buffer via the loaded KHR entry point.
+    VkBufferDeviceAddressInfo query{};
+    query.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO;
+    query.buffer = b.buffer;
+    return g.vkGetBufferDeviceAddressKHR_(g.device, &query);
+}
+
+static bool one_time_submit(const std::function<void(VkCommandBuffer)> &record, std::string &error) { // Records commands via record, submits them, and blocks until the queue is idle.
+    if (!check(vkResetCommandBuffer(g.command_buffer, 0), "vkResetCommandBuffer", error)) return false; // the single command buffer is reused for every submission
+
+    VkCommandBufferBeginInfo bi{};
+    bi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+    bi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+    if (!check(vkBeginCommandBuffer(g.command_buffer, &bi), "vkBeginCommandBuffer", error)) return false;
+    record(g.command_buffer); // caller-supplied recording callback
+    if (!check(vkEndCommandBuffer(g.command_buffer), "vkEndCommandBuffer", error)) return false;
+
+    VkSubmitInfo si{};
+    si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+    si.commandBufferCount = 1;
+    si.pCommandBuffers = &g.command_buffer;
+    if (!check(vkQueueSubmit(g.queue, 1, &si, VK_NULL_HANDLE), "vkQueueSubmit", error)) return false; // no fence: completion is awaited with vkQueueWaitIdle below
+    if (!check(vkQueueWaitIdle(g.queue), "vkQueueWaitIdle", error)) return false;
+    return true;
+}
+
+static bool create_instance(std::string &error) { // Creates g.instance targeting Vulkan 1.2, with no layers or instance extensions.
+    VkApplicationInfo app_info{};
+    app_info.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+    app_info.pApplicationName = "ericw-tools light gpu";
+    app_info.applicationVersion = VK_MAKE_VERSION(0, 2, 0);
+    app_info.pEngineName = "ericw-tools";
+    app_info.engineVersion = VK_MAKE_VERSION(0, 2, 0);
+    app_info.apiVersion = VK_API_VERSION_1_2;
+    VkInstanceCreateInfo create_info{};
+    create_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+    create_info.pApplicationInfo = &app_info;
+    const VkResult result = vkCreateInstance(&create_info, nullptr, &g.instance);
+    return check(result, "vkCreateInstance", error);
+}
+
+static bool pick_device(std::string &error) { // Selects the first enumerated device with the required extensions, features and a compute queue.
+    std::uint32_t count = 0;
+    if (!check(vkEnumeratePhysicalDevices(g.instance, &count, nullptr), "vkEnumeratePhysicalDevices(count)", error)) return false;
+    if (!count) { error = "no Vulkan physical devices found"; return false; }
+
+    std::vector<VkPhysicalDevice> devices(count);
+    if (!check(vkEnumeratePhysicalDevices(g.instance, &count, devices.data()), "vkEnumeratePhysicalDevices(list)", error)) return false;
+
+    for (VkPhysicalDevice dev : devices) { // enumeration order decides; there is no preference by device type
+        std::uint32_t ext_count = 0;
+        vkEnumerateDeviceExtensionProperties(dev, nullptr, &ext_count, nullptr); // results unchecked: a failed query leaves an empty/zeroed list and the device is skipped below
+        std::vector<VkExtensionProperties> exts(ext_count);
+        vkEnumerateDeviceExtensionProperties(dev, nullptr, &ext_count, exts.data());
+
+        if (!has_extension(exts, VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME) ||
+            !has_extension(exts, VK_KHR_RAY_QUERY_EXTENSION_NAME) ||
+            !has_extension(exts, VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME) ||
+            !has_extension(exts, VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME)) {
+            continue;
+        }
+
+        VkPhysicalDeviceBufferDeviceAddressFeatures bda{}; // feature query chain: f2 -> as -> rq -> bda
+        bda.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES;
+        VkPhysicalDeviceRayQueryFeaturesKHR rq{};
+        rq.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR;
+        rq.pNext = &bda;
+        VkPhysicalDeviceAccelerationStructureFeaturesKHR as{};
+        as.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR;
+        as.pNext = &rq;
+        VkPhysicalDeviceFeatures2 f2{};
+        f2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+        f2.pNext = &as;
+        vkGetPhysicalDeviceFeatures2(dev, &f2); // NOTE(review): the device's own apiVersion is never checked against the 1.2 instance target - confirm
+        if (!as.accelerationStructure || !rq.rayQuery || !bda.bufferDeviceAddress) continue;
+
+        std::uint32_t q_count = 0;
+        vkGetPhysicalDeviceQueueFamilyProperties(dev, &q_count, nullptr);
+        std::vector<VkQueueFamilyProperties> qs(q_count);
+        vkGetPhysicalDeviceQueueFamilyProperties(dev, &q_count, qs.data());
+        for (std::uint32_t i = 0; i < q_count; ++i) {
+            if (qs[i].queueFlags & VK_QUEUE_COMPUTE_BIT) { // first compute-capable family wins
+                g.physical = dev;
+                g.queue_family = i;
+                vkGetPhysicalDeviceMemoryProperties(dev, &g.memory_props); // cached for find_memory_type()
+                return true;
+            }
+        }
+    }
+
+    error = "no Vulkan device with acceleration_structure + ray_query + buffer_device_address + compute queue found";
+    return false;
+}
+
+static bool create_device(std::string &error) { // Creates the logical device, fetches its queue, loads the KHR entry points and sets up the command pool/buffer.
+    float priority = 1.0f;
+    VkDeviceQueueCreateInfo qci{};
+    qci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+    qci.queueFamilyIndex = g.queue_family;
+    qci.queueCount = 1;
+    qci.pQueuePriorities = &priority;
+
+    VkPhysicalDeviceBufferDeviceAddressFeatures bda{}; // enabled-feature chain: dci -> as -> rq -> bda
+    bda.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES;
+    bda.bufferDeviceAddress = VK_TRUE;
+
+    VkPhysicalDeviceRayQueryFeaturesKHR rq{};
+    rq.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR;
+    rq.rayQuery = VK_TRUE;
+    rq.pNext = &bda;
+
+    VkPhysicalDeviceAccelerationStructureFeaturesKHR as{};
+    as.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR;
+    as.accelerationStructure = VK_TRUE;
+    as.pNext = &rq;
+
+    const char *extensions[] = { // the same four extensions pick_device() requires
+        VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME,
+        VK_KHR_RAY_QUERY_EXTENSION_NAME,
+        VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME,
+        VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME,
+    };
+
+    VkDeviceCreateInfo dci{};
+    dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+    dci.pNext = &as;
+    dci.queueCreateInfoCount = 1;
+    dci.pQueueCreateInfos = &qci;
+    dci.enabledExtensionCount = static_cast<std::uint32_t>(sizeof(extensions) / sizeof(extensions[0]));
+    dci.ppEnabledExtensionNames = extensions;
+
+    if (!check(vkCreateDevice(g.physical, &dci, nullptr, &g.device), "vkCreateDevice", error)) return false;
+    vkGetDeviceQueue(g.device, g.queue_family, 0, &g.queue);
+
+#define LOAD_DEVICE_PROC(name) \
+    g.name##_ = reinterpret_cast<PFN_##name>(vkGetDeviceProcAddr(g.device, #name)); \
+    if (!g.name##_) { error = "missing device proc " #name; return false; }
+    LOAD_DEVICE_PROC(vkGetBufferDeviceAddressKHR); // a missing proc returns false with g.device still set; presumably the caller cleans up via destroy_locked()
+    LOAD_DEVICE_PROC(vkCreateAccelerationStructureKHR);
+    LOAD_DEVICE_PROC(vkDestroyAccelerationStructureKHR);
+    LOAD_DEVICE_PROC(vkGetAccelerationStructureBuildSizesKHR);
+    LOAD_DEVICE_PROC(vkCmdBuildAccelerationStructuresKHR);
+    LOAD_DEVICE_PROC(vkGetAccelerationStructureDeviceAddressKHR); // NOTE(review): vkCmdWriteAccelerationStructuresPropertiesKHR_ is declared in context_t but not loaded here
+#undef LOAD_DEVICE_PROC
+
+    VkCommandPoolCreateInfo pci{};
+    pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+    pci.queueFamilyIndex = g.queue_family;
+    pci.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; // one_time_submit() resets the buffer individually
+    if (!check(vkCreateCommandPool(g.device, &pci, nullptr, &g.command_pool), "vkCreateCommandPool", error)) return false;
+
+    VkCommandBufferAllocateInfo cai{};
+    cai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+    cai.commandPool = g.command_pool;
+    cai.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+    cai.commandBufferCount = 1;
+    if (!check(vkAllocateCommandBuffers(g.device, &cai, &g.command_buffer), "vkAllocateCommandBuffers", error)) return false;
+
+    return true;
+}
+
+static bool gather_geometry(const mbsp_t *bsp, std::vector<vertex_t> &vertices, std::vector<std::uint32_t> &indices, std::string &error) { // Triangulates the shadow-casting solid faces into an unshared vertex/index list.
+    vertices.clear();
+    indices.clear();
+
+    const auto &faces = ShadowCastingSolidFacesSet();
+    if (faces.empty()) {
+        error = "no shadow-casting solid faces found for GPU BLAS; call Embree_TraceInit before GPU_TraceInit";
+        return false;
+    }
+
+    for (const mface_t *face : faces) {
+        if (!face || face->numedges < 3) continue; // fewer than three edges cannot form a triangle
+        const modelinfo_t *modelinfo = ModelInfoForFace(bsp, Face_GetNum(bsp, face));
+        if (!modelinfo) continue;
+
+        for (int j = 2; j < face->numedges; ++j) { // fan around vertex 0: triangle (j-1, j, 0)
+            const int v0 = Face_VertexAtIndex(bsp, face, j - 1);
+            const int v1 = Face_VertexAtIndex(bsp, face, j);
+            const int v2 = Face_VertexAtIndex(bsp, face, 0);
+            const qvec3f p0 = Vertex_GetPos(bsp, v0) + modelinfo->offset; // positions are shifted by the owning model's offset
+            const qvec3f p1 = Vertex_GetPos(bsp, v1) + modelinfo->offset;
+            const qvec3f p2 = Vertex_GetPos(bsp, v2) + modelinfo->offset;
+
+            const std::uint32_t base = static_cast<std::uint32_t>(vertices.size());
+            vertices.push_back({p0[0], p0[1], p0[2]}); // three fresh vertices per triangle; nothing is shared between triangles
+            vertices.push_back({p1[0], p1[1], p1[2]});
+            vertices.push_back({p2[0], p2[1], p2[2]});
+            indices.push_back(base + 0);
+            indices.push_back(base + 1);
+            indices.push_back(base + 2);
+        }
+    }
+
+    if (indices.empty()) {
+        error = "GPU geometry gather produced zero triangles";
+        return false;
+    }
+    return true;
+}
+
+static bool create_acceleration_structure(VkAccelerationStructureTypeKHR type, VkDeviceSize size, as_t &out, std::string &error) { // Allocates storage of `size` bytes, creates the structure in it, and records its device address.
+    if (!create_buffer(size,
+            VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+            out.storage,
+            error)) return false;
+
+    VkAccelerationStructureCreateInfoKHR ci{};
+    ci.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR;
+    ci.type = type;
+    ci.size = size;
+    ci.buffer = out.storage.buffer;
+    if (!check(g.vkCreateAccelerationStructureKHR_(g.device, &ci, nullptr, &out.as), "vkCreateAccelerationStructureKHR", error)) return false; // out.storage stays allocated on this path; destroy_as(out) releases it
+
+    VkAccelerationStructureDeviceAddressInfoKHR ai{};
+    ai.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_DEVICE_ADDRESS_INFO_KHR;
+    ai.accelerationStructure = out.as;
+    out.address = g.vkGetAccelerationStructureDeviceAddressKHR_(g.device, &ai);
+    return true;
+}
+
+static bool build_blas(const std::vector<vertex_t> &vertices, const std::vector<std::uint32_t> &indices, std::string &error) { // Uploads the geometry and builds g.blas on the device; expects non-empty inputs.
+    if (!create_buffer(sizeof(vertex_t) * vertices.size(),
+            VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+            g.vertices,
+            error,
+            vertices.data())) return false;
+
+    if (!create_buffer(sizeof(std::uint32_t) * indices.size(),
+            VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+            g.indices,
+            error,
+            indices.data())) return false;
+
+    VkDeviceAddress vertex_addr = buffer_address(g.vertices);
+    VkDeviceAddress index_addr = buffer_address(g.indices);
+
+    VkAccelerationStructureGeometryKHR geom{};
+    geom.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR;
+    geom.geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR;
+    geom.flags = VK_GEOMETRY_OPAQUE_BIT_KHR; // every triangle is flagged opaque
+    geom.geometry.triangles.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_TRIANGLES_DATA_KHR;
+    geom.geometry.triangles.vertexFormat = VK_FORMAT_R32G32B32_SFLOAT;
+    geom.geometry.triangles.vertexData.deviceAddress = vertex_addr;
+    geom.geometry.triangles.vertexStride = sizeof(vertex_t);
+    geom.geometry.triangles.maxVertex = static_cast<std::uint32_t>(vertices.size() - 1); // would wrap for an empty vector; gather_geometry() rejects that case
+    geom.geometry.triangles.indexType = VK_INDEX_TYPE_UINT32;
+    geom.geometry.triangles.indexData.deviceAddress = index_addr;
+
+    const std::uint32_t prim_count = static_cast<std::uint32_t>(indices.size() / 3);
+    g.triangle_count = prim_count;
+
+    VkAccelerationStructureBuildGeometryInfoKHR build{};
+    build.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR;
+    build.type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR;
+    build.flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR;
+    build.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR;
+    build.geometryCount = 1;
+    build.pGeometries = &geom;
+
+    VkAccelerationStructureBuildSizesInfoKHR sizes{};
+    sizes.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR;
+    g.vkGetAccelerationStructureBuildSizesKHR_(g.device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &build, &prim_count, &sizes); // yields the storage and scratch sizes used below
+
+    if (!create_acceleration_structure(VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR, sizes.accelerationStructureSize, g.blas, error)) return false;
+
+    buffer_t scratch; // temporary build memory, destroyed before returning
+    if (!create_buffer(sizes.buildScratchSize,
+            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+            scratch,
+            error)) return false;
+
+    build.dstAccelerationStructure = g.blas.as;
+    build.scratchData.deviceAddress = buffer_address(scratch);
+
+    VkAccelerationStructureBuildRangeInfoKHR range{};
+    range.primitiveCount = prim_count;
+    const VkAccelerationStructureBuildRangeInfoKHR *range_ptr = &range;
+
+    bool ok = one_time_submit([&](VkCommandBuffer cmd) { // one_time_submit() waits for the queue, so the by-reference captures stay valid
+        g.vkCmdBuildAccelerationStructuresKHR_(cmd, 1, &build, &range_ptr);
+    }, error);
+
+    destroy_buffer(scratch); // safe here: the build has completed (or failed) by now
+    return ok;
+}
+
+static bool build_tlas(std::string &error) { // Builds g.tlas: a top-level acceleration structure containing a single instance of g.blas. Returns false as soon as one of the helper calls fails.
+    VkAccelerationStructureInstanceKHR inst{};
+    inst.transform.matrix[0][0] = 1.0f; // inst is zero-initialised, so setting the three diagonal entries yields an identity transform
+    inst.transform.matrix[1][1] = 1.0f;
+    inst.transform.matrix[2][2] = 1.0f;
+    inst.instanceCustomIndex = 0;
+    inst.mask = 0xff; // all eight bits of the instance mask set
+    inst.instanceShaderBindingTableRecordOffset = 0;
+    inst.flags = VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR;
+    inst.accelerationStructureReference = g.blas.address; // the one instance refers to the bottom-level structure
+
+    if (!create_buffer(sizeof(inst), // the instance record lives in g.instances (global state), so it is not destroyed in this function
+            VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+            g.instances,
+            error,
+            &inst)) return false; // &inst is presumably the initial contents copied in by create_buffer - confirm against its definition
+
+    VkAccelerationStructureGeometryKHR geom{};
+    geom.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR;
+    geom.geometryType = VK_GEOMETRY_TYPE_INSTANCES_KHR;
+    geom.flags = VK_GEOMETRY_OPAQUE_BIT_KHR;
+    geom.geometry.instances.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_INSTANCES_DATA_KHR;
+    geom.geometry.instances.arrayOfPointers = VK_FALSE; // data points at an array of instance structs, not an array of pointers
+    geom.geometry.instances.data.deviceAddress = buffer_address(g.instances);
+
+    const std::uint32_t prim_count = 1; // exactly one instance
+
+    VkAccelerationStructureBuildGeometryInfoKHR build{};
+    build.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR;
+    build.type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR;
+    build.flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR;
+    build.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR;
+    build.geometryCount = 1;
+    build.pGeometries = &geom;
+
+    VkAccelerationStructureBuildSizesInfoKHR sizes{};
+    sizes.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR;
+    g.vkGetAccelerationStructureBuildSizesKHR_(g.device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &build, &prim_count, &sizes); // query the structure and scratch sizes before allocating either
+
+    if (!create_acceleration_structure(VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR, sizes.accelerationStructureSize, g.tlas, error)) return false;
+
+    buffer_t scratch; // temporary build scratch space, released below
+    if (!create_buffer(sizes.buildScratchSize,
+            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+            scratch,
+            error)) return false; // g.instances and g.tlas stay in g; init() calls destroy_locked() when this function fails
+
+    build.dstAccelerationStructure = g.tlas.as;
+    build.scratchData.deviceAddress = buffer_address(scratch);
+
+    VkAccelerationStructureBuildRangeInfoKHR range{};
+    range.primitiveCount = prim_count;
+    const VkAccelerationStructureBuildRangeInfoKHR *range_ptr = &range; // the build command takes an array of pointers to range infos
+
+    bool ok = one_time_submit([&](VkCommandBuffer cmd) {
+        g.vkCmdBuildAccelerationStructuresKHR_(cmd, 1, &build, &range_ptr);
+    }, error);
+
+    destroy_buffer(scratch); // NOTE(review): assumes one_time_submit has waited for the GPU before returning - confirm
+    return ok;
+}
+
+static std::filesystem::path exe_dir() { // Returns the directory of the running executable on Linux; on other platforms, or if readlink fails, the current working directory.
+#if defined(__linux__)
+    std::array<char, PATH_MAX> buf{};
+    ssize_t len = readlink("/proc/self/exe", buf.data(), buf.size() - 1); // size - 1 keeps one byte free for the terminator written below
+    if (len > 0) {
+        buf[static_cast<std::size_t>(len)] = '\0'; // readlink does not NUL-terminate its output
+        return std::filesystem::path(buf.data()).parent_path();
+    }
+#endif
+    return std::filesystem::current_path(); // fallback: shaders are then looked up relative to the working directory
+}
+
+static bool read_file(const std::filesystem::path &path, std::vector<std::uint32_t> &words, std::string &error) { // Reads a whole SPIR-V file into `words` as 32-bit words. On failure sets `error` (message includes the path) and returns false.
+    std::ifstream f(path, std::ios::binary | std::ios::ate); // ate: start at the end so tellg() reports the file size
+    if (!f) { error = "could not open shader: " + path.string(); return false; }
+    const std::streamsize size = f.tellg();
+    if (size <= 0 || (size % 4) != 0) { error = "shader has invalid SPIR-V size: " + path.string(); return false; } // must be a non-empty whole number of 4-byte words
+    f.seekg(0, std::ios::beg);
+    words.resize(static_cast<std::size_t>(size / 4));
+    if (!f.read(reinterpret_cast<char *>(words.data()), size)) { error = "failed to read shader: " + path.string(); return false; } // `words` has already been resized at this point
+    return true;
+}
+
+static bool create_pipeline(std::string &error) { // Creates the occlusion compute pipeline state in g: descriptor set layout, pipeline layout, pipeline, descriptor pool and one descriptor set. Returns false at the first failing step.
+    VkDescriptorSetLayoutBinding b0{}; // binding 0: the acceleration structure
+    b0.binding = 0;
+    b0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR;
+    b0.descriptorCount = 1;
+    b0.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkDescriptorSetLayoutBinding b1{}; // binding 1: storage buffer (update_descriptor_set writes the ray buffer here)
+    b1.binding = 1;
+    b1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    b1.descriptorCount = 1;
+    b1.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkDescriptorSetLayoutBinding b2{}; // binding 2: storage buffer (update_descriptor_set writes the result buffer here)
+    b2.binding = 2;
+    b2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    b2.descriptorCount = 1;
+    b2.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    std::array<VkDescriptorSetLayoutBinding, 3> bindings{b0, b1, b2};
+    VkDescriptorSetLayoutCreateInfo dlci{};
+    dlci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+    dlci.bindingCount = static_cast<std::uint32_t>(bindings.size());
+    dlci.pBindings = bindings.data();
+    if (!check(vkCreateDescriptorSetLayout(g.device, &dlci, nullptr, &g.descriptor_set_layout), "vkCreateDescriptorSetLayout", error)) return false;
+
+    VkPushConstantRange pcr{}; // a single compute-stage push-constant range covering push_constants_t
+    pcr.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+    pcr.offset = 0;
+    pcr.size = sizeof(push_constants_t);
+
+    VkPipelineLayoutCreateInfo plci{};
+    plci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+    plci.setLayoutCount = 1;
+    plci.pSetLayouts = &g.descriptor_set_layout;
+    plci.pushConstantRangeCount = 1;
+    plci.pPushConstantRanges = &pcr;
+    if (!check(vkCreatePipelineLayout(g.device, &plci, nullptr, &g.pipeline_layout), "vkCreatePipelineLayout", error)) return false; // objects already stored in g are left for the caller; init() calls destroy_locked() on failure
+
+    std::vector<std::uint32_t> spv;
+    const auto shader_path = exe_dir() / "gpu_shaders" / "occlusion.comp.spv"; // the compiled shader is loaded at run time from <exe_dir>/gpu_shaders
+    if (!read_file(shader_path, spv, error)) return false;
+
+    VkShaderModuleCreateInfo smci{};
+    smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    smci.codeSize = spv.size() * sizeof(std::uint32_t); // codeSize is in bytes, spv holds 32-bit words
+    smci.pCode = spv.data();
+    VkShaderModule shader = VK_NULL_HANDLE;
+    if (!check(vkCreateShaderModule(g.device, &smci, nullptr, &shader), "vkCreateShaderModule", error)) return false;
+
+    VkComputePipelineCreateInfo cpci{};
+    cpci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    cpci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    cpci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    cpci.stage.module = shader;
+    cpci.stage.pName = "main";
+    cpci.layout = g.pipeline_layout;
+    bool ok = check(vkCreateComputePipelines(g.device, VK_NULL_HANDLE, 1, &cpci, nullptr, &g.pipeline), "vkCreateComputePipelines", error);
+    vkDestroyShaderModule(g.device, shader, nullptr); // the module is released whether or not pipeline creation succeeded
+    if (!ok) return false;
+
+    VkDescriptorPoolSize ps0{}; // pool capacity: one acceleration-structure descriptor ...
+    ps0.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR;
+    ps0.descriptorCount = 1;
+    VkDescriptorPoolSize ps1{}; // ... and two storage-buffer descriptors, matching bindings 1 and 2
+    ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    ps1.descriptorCount = 2;
+    std::array<VkDescriptorPoolSize, 2> sizes{ps0, ps1};
+
+    VkDescriptorPoolCreateInfo dpci{};
+    dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+    dpci.maxSets = 1; // only g.descriptor_set is ever allocated from this pool in this function
+    dpci.poolSizeCount = static_cast<std::uint32_t>(sizes.size());
+    dpci.pPoolSizes = sizes.data();
+    if (!check(vkCreateDescriptorPool(g.device, &dpci, nullptr, &g.descriptor_pool), "vkCreateDescriptorPool", error)) return false;
+
+    VkDescriptorSetAllocateInfo dsai{};
+    dsai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+    dsai.descriptorPool = g.descriptor_pool;
+    dsai.descriptorSetCount = 1;
+    dsai.pSetLayouts = &g.descriptor_set_layout;
+    if (!check(vkAllocateDescriptorSets(g.device, &dsai, &g.descriptor_set), "vkAllocateDescriptorSets", error)) return false;
+
+    return true;
+}
+
+static bool create_direct_pipeline(std::string &error) { // Creates the direct-phase compute pipeline state in g (the g.direct_* members): descriptor set layout, pipeline layout, pipeline, descriptor pool and one descriptor set. Returns false at the first failing step.
+    VkDescriptorSetLayoutBinding b0{}; // binding 0: the acceleration structure
+    b0.binding = 0;
+    b0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR;
+    b0.descriptorCount = 1;
+    b0.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkDescriptorSetLayoutBinding b1{}; // bindings 1-3: storage buffers, written by update_direct_descriptor_set
+    b1.binding = 1;
+    b1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    b1.descriptorCount = 1;
+    b1.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkDescriptorSetLayoutBinding b2{};
+    b2.binding = 2;
+    b2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    b2.descriptorCount = 1;
+    b2.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkDescriptorSetLayoutBinding b3{};
+    b3.binding = 3;
+    b3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    b3.descriptorCount = 1;
+    b3.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    std::array<VkDescriptorSetLayoutBinding, 4> bindings{b0, b1, b2, b3};
+    VkDescriptorSetLayoutCreateInfo dlci{};
+    dlci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+    dlci.bindingCount = static_cast<std::uint32_t>(bindings.size());
+    dlci.pBindings = bindings.data();
+    if (!check(vkCreateDescriptorSetLayout(g.device, &dlci, nullptr, &g.direct_descriptor_set_layout), "vkCreateDescriptorSetLayout(direct)", error)) return false;
+
+    VkPushConstantRange pcr{}; // a single compute-stage push-constant range covering direct_push_constants_t
+    pcr.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+    pcr.offset = 0;
+    pcr.size = sizeof(direct_push_constants_t);
+
+    VkPipelineLayoutCreateInfo plci{};
+    plci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+    plci.setLayoutCount = 1;
+    plci.pSetLayouts = &g.direct_descriptor_set_layout;
+    plci.pushConstantRangeCount = 1;
+    plci.pPushConstantRanges = &pcr;
+    if (!check(vkCreatePipelineLayout(g.device, &plci, nullptr, &g.direct_pipeline_layout), "vkCreatePipelineLayout(direct)", error)) return false; // objects already stored in g are left for the caller; init() calls destroy_locked() on failure
+
+    std::vector<std::uint32_t> spv;
+    const auto shader_path = exe_dir() / "gpu_shaders" / "direct_phase.comp.spv"; // the compiled shader is loaded at run time from <exe_dir>/gpu_shaders
+    if (!read_file(shader_path, spv, error)) return false;
+
+    VkShaderModuleCreateInfo smci{};
+    smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    smci.codeSize = spv.size() * sizeof(std::uint32_t); // codeSize is in bytes, spv holds 32-bit words
+    smci.pCode = spv.data();
+    VkShaderModule shader = VK_NULL_HANDLE;
+    if (!check(vkCreateShaderModule(g.device, &smci, nullptr, &shader), "vkCreateShaderModule(direct)", error)) return false;
+
+    VkComputePipelineCreateInfo cpci{};
+    cpci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    cpci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    cpci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    cpci.stage.module = shader;
+    cpci.stage.pName = "main";
+    cpci.layout = g.direct_pipeline_layout;
+    bool ok = check(vkCreateComputePipelines(g.device, VK_NULL_HANDLE, 1, &cpci, nullptr, &g.direct_pipeline), "vkCreateComputePipelines(direct)", error);
+    vkDestroyShaderModule(g.device, shader, nullptr); // the module is released whether or not pipeline creation succeeded
+    if (!ok) return false;
+
+    VkDescriptorPoolSize ps0{}; // pool capacity: one acceleration-structure descriptor ...
+    ps0.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR;
+    ps0.descriptorCount = 1;
+    VkDescriptorPoolSize ps1{}; // ... and three storage-buffer descriptors, matching bindings 1-3
+    ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    ps1.descriptorCount = 3;
+    std::array<VkDescriptorPoolSize, 2> sizes{ps0, ps1};
+
+    VkDescriptorPoolCreateInfo dpci{};
+    dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+    dpci.maxSets = 1; // only g.direct_descriptor_set is ever allocated from this pool in this function
+    dpci.poolSizeCount = static_cast<std::uint32_t>(sizes.size());
+    dpci.pPoolSizes = sizes.data();
+    if (!check(vkCreateDescriptorPool(g.device, &dpci, nullptr, &g.direct_descriptor_pool), "vkCreateDescriptorPool(direct)", error)) return false;
+
+    VkDescriptorSetAllocateInfo dsai{};
+    dsai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+    dsai.descriptorPool = g.direct_descriptor_pool;
+    dsai.descriptorSetCount = 1;
+    dsai.pSetLayouts = &g.direct_descriptor_set_layout;
+    if (!check(vkAllocateDescriptorSets(g.device, &dsai, &g.direct_descriptor_set), "vkAllocateDescriptorSets(direct)", error)) return false;
+
+    return true;
+}
+
+static void update_direct_descriptor_set(const buffer_t &job_buffer, const buffer_t &range_buffer, const buffer_t &accum_buffer) { // Points g.direct_descriptor_set at g.tlas (binding 0) and the three buffers (bindings 1-3). NOTE(review): trace_direct_phase_batch passes its sample, source and accum buffers here, so the job/range parameter names look stale.
+    VkWriteDescriptorSetAccelerationStructureKHR as_info{};
+    as_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR;
+    as_info.accelerationStructureCount = 1;
+    as_info.pAccelerationStructures = &g.tlas.as;
+
+    VkWriteDescriptorSet w0{}; // binding 0: acceleration structure
+    w0.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w0.pNext = &as_info; // the acceleration structure handle is supplied through the pNext chain, not pBufferInfo
+    w0.dstSet = g.direct_descriptor_set;
+    w0.dstBinding = 0;
+    w0.descriptorCount = 1;
+    w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR;
+
+    VkDescriptorBufferInfo job_info{};
+    job_info.buffer = job_buffer.buffer;
+    job_info.offset = 0;
+    job_info.range = job_buffer.size; // expose the whole buffer
+    VkWriteDescriptorSet w1{}; // binding 1: first buffer argument
+    w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w1.dstSet = g.direct_descriptor_set;
+    w1.dstBinding = 1;
+    w1.descriptorCount = 1;
+    w1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    w1.pBufferInfo = &job_info;
+
+    VkDescriptorBufferInfo range_info{};
+    range_info.buffer = range_buffer.buffer;
+    range_info.offset = 0;
+    range_info.range = range_buffer.size;
+    VkWriteDescriptorSet w2{}; // binding 2: second buffer argument
+    w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w2.dstSet = g.direct_descriptor_set;
+    w2.dstBinding = 2;
+    w2.descriptorCount = 1;
+    w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    w2.pBufferInfo = &range_info;
+
+    VkDescriptorBufferInfo accum_info{};
+    accum_info.buffer = accum_buffer.buffer;
+    accum_info.offset = 0;
+    accum_info.range = accum_buffer.size;
+    VkWriteDescriptorSet w3{}; // binding 3: accumulator buffer
+    w3.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w3.dstSet = g.direct_descriptor_set;
+    w3.dstBinding = 3;
+    w3.descriptorCount = 1;
+    w3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    w3.pBufferInfo = &accum_info;
+
+    std::array<VkWriteDescriptorSet, 4> writes{w0, w1, w2, w3}; // copies of the structs; the pointers inside still refer to the locals above, which outlive the call below
+    vkUpdateDescriptorSets(g.device, static_cast<std::uint32_t>(writes.size()), writes.data(), 0, nullptr);
+}
+
+static void update_descriptor_set(const buffer_t &ray_buffer, const buffer_t &result_buffer) { // Points g.descriptor_set at g.tlas (binding 0), the ray buffer (binding 1) and the result buffer (binding 2).
+    VkWriteDescriptorSetAccelerationStructureKHR as_info{};
+    as_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR;
+    as_info.accelerationStructureCount = 1;
+    as_info.pAccelerationStructures = &g.tlas.as;
+
+    VkWriteDescriptorSet w0{}; // binding 0: acceleration structure
+    w0.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w0.pNext = &as_info; // the acceleration structure handle is supplied through the pNext chain, not pBufferInfo
+    w0.dstSet = g.descriptor_set;
+    w0.dstBinding = 0;
+    w0.descriptorCount = 1;
+    w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR;
+
+    VkDescriptorBufferInfo ray_info{};
+    ray_info.buffer = ray_buffer.buffer;
+    ray_info.offset = 0;
+    ray_info.range = ray_buffer.size; // expose the whole buffer
+    VkWriteDescriptorSet w1{}; // binding 1: rays
+    w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w1.dstSet = g.descriptor_set;
+    w1.dstBinding = 1;
+    w1.descriptorCount = 1;
+    w1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    w1.pBufferInfo = &ray_info;
+
+    VkDescriptorBufferInfo result_info{};
+    result_info.buffer = result_buffer.buffer;
+    result_info.offset = 0;
+    result_info.range = result_buffer.size;
+    VkWriteDescriptorSet w2{}; // binding 2: results
+    w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w2.dstSet = g.descriptor_set;
+    w2.dstBinding = 2;
+    w2.descriptorCount = 1;
+    w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    w2.pBufferInfo = &result_info;
+
+    std::array<VkWriteDescriptorSet, 3> writes{w0, w1, w2}; // copies of the structs; the pointers inside still refer to the locals above, which outlive the call below
+    vkUpdateDescriptorSets(g.device, static_cast<std::uint32_t>(writes.size()), writes.data(), 0, nullptr);
+}
+
+} // namespace
+
+bool init(const mbsp_t *bsp, std::string &error) { // (Re)initialises the Vulkan backend while holding g_mutex. Every failing step calls destroy_locked() and returns false; the step helpers receive `error`.
+    std::lock_guard<std::mutex> lock(g_mutex);
+    destroy_locked(); // drop any state left from a previous init before starting over
+
+    if (!create_instance(error)) { destroy_locked(); return false; }
+    if (!pick_device(error)) { destroy_locked(); return false; }
+    if (!create_device(error)) { destroy_locked(); return false; }
+
+    g.has_filtered_embree_geometry = !filtergeom.triInfo.empty(); // filtergeom is defined outside this chunk; the trace_* entry points return false whenever this flag is set
+    if (g.has_filtered_embree_geometry) {
+        logging::print("GPU light: filtered Embree geometry exists ({} tris); GPU will fall back for correctness.\n", filtergeom.triInfo.size());
+    }
+
+    std::vector<vertex_t> vertices;
+    std::vector<std::uint32_t> indices;
+    if (!gather_geometry(bsp, vertices, indices, error)) { destroy_locked(); return false; }
+    if (!build_blas(vertices, indices, error)) { destroy_locked(); return false; } // the BLAS must exist first: build_tlas references g.blas.address
+    if (!build_tlas(error)) { destroy_locked(); return false; }
+    if (!create_pipeline(error)) { destroy_locked(); return false; } // occlusion pipeline
+    if (!create_direct_pipeline(error)) { destroy_locked(); return false; } // direct-phase pipeline
+
+    logging::print("GPU light: Vulkan ray-query BLAS/TLAS ready ({} opaque triangles).\n", g.triangle_count);
+    return true;
+}
+
+void shutdown() { // Tears down the backend state via destroy_locked() while holding g_mutex.
+    std::lock_guard<std::mutex> lock(g_mutex);
+    destroy_locked();
+}
+
+bool trace_occlusion_batch( // Traces `count` rays on the GPU and writes one occlusion_result_t per ray into `results`. Returns true on success or for an empty batch; false means no results were produced and the caller has to handle the batch itself.
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const gpu_light::ray_t *rays,
+    gpu_light::occlusion_result_t *results,
+    std::size_t count,
+    std::string &error) {
+    std::lock_guard<std::mutex> lock(g_mutex);
+    if (!g.device || !g.pipeline || !g.tlas.as) {
+        error = "Vulkan GPU backend is not initialized";
+        return false;
+    }
+
+    // Correctness guard: the GPU fast path only contains opaque solid/default geometry.
+    // If Embree has filtered geometry, let CPU handle batches so glass/fence/dynamic/channel filters remain correct.
+    if (g.has_filtered_embree_geometry || shadow_mask != CHANNEL_MASK_DEFAULT) {
+        return false; // fallback rather than failure: `error` is deliberately left untouched
+    }
+    if (count == 0) return true; // empty batch is a no-op (same convention as trace_direct_phase_batch); also avoids requesting zero-sized buffers
+    if (!rays || !results) { error = "trace_occlusion_batch: null ray or result array"; return false; }
+    if (count > 0xffffffffu) { error = "trace_occlusion_batch: batch too large for 32-bit ray count"; return false; } // pc.ray_count is 32-bit; refuse instead of truncating silently
+    (void)self;
+
+    std::vector<gpu_ray_host_t> gpu_rays(count); // repack the rays field by field into the host-side GPU layout
+    for (std::size_t i = 0; i < count; ++i) {
+        gpu_rays[i].ox = rays[i].origin[0];
+        gpu_rays[i].oy = rays[i].origin[1];
+        gpu_rays[i].oz = rays[i].origin[2];
+        gpu_rays[i].tmin = rays[i].tmin;
+        gpu_rays[i].dx = rays[i].direction[0];
+        gpu_rays[i].dy = rays[i].direction[1];
+        gpu_rays[i].dz = rays[i].direction[2];
+        gpu_rays[i].tmax = rays[i].tmax;
+        gpu_rays[i].shadow_mask = rays[i].shadow_mask;
+        gpu_rays[i].user_index = rays[i].user_index;
+    }
+
+    buffer_t ray_buffer;
+    buffer_t result_buffer;
+    std::vector<gpu_result_host_t> zero_results(count); // value-initialised contents used to seed the result buffer
+
+    bool ok = create_buffer(sizeof(gpu_ray_host_t) * count,
+        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        ray_buffer,
+        error,
+        gpu_rays.data());
+    if (!ok) return false;
+
+    ok = create_buffer(sizeof(gpu_result_host_t) * count,
+        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        result_buffer,
+        error,
+        zero_results.data());
+    if (!ok) {
+        destroy_buffer(ray_buffer);
+        return false;
+    }
+
+    update_descriptor_set(ray_buffer, result_buffer); // the single shared descriptor set is rewritten per batch; g_mutex serialises callers
+
+    push_constants_t pc{};
+    pc.ray_count = static_cast<std::uint32_t>(count); // lossless thanks to the range check above
+    pc.flags = 0;
+
+    ok = one_time_submit([&](VkCommandBuffer cmd) {
+        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.pipeline);
+        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.pipeline_layout, 0, 1, &g.descriptor_set, 0, nullptr);
+        vkCmdPushConstants(cmd, g.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
+        vkCmdDispatch(cmd, (pc.ray_count + 127u) / 128u, 1, 1); // ceil(count / 128) groups; 128 presumably matches the shader's local size - confirm against occlusion.comp
+    }, error);
+
+    if (ok) {
+        void *mapped = nullptr;
+        ok = check(vkMapMemory(g.device, result_buffer.memory, 0, result_buffer.size, 0, &mapped), "vkMapMemory(result)", error);
+        if (ok) {
+            const auto *gpu_results = static_cast<const gpu_result_host_t *>(mapped);
+            for (std::size_t i = 0; i < count; ++i) { // unpack the GPU layout into the caller's result structs
+                results[i].occluded = gpu_results[i].occluded;
+                results[i].reserved0 = gpu_results[i].reserved0;
+                results[i].transmittance[0] = gpu_results[i].tr;
+                results[i].transmittance[1] = gpu_results[i].tg;
+                results[i].transmittance[2] = gpu_results[i].tb;
+            }
+            vkUnmapMemory(g.device, result_buffer.memory);
+        }
+    }
+
+    destroy_buffer(result_buffer); // both per-batch buffers are released on success and failure alike
+    destroy_buffer(ray_buffer);
+    return ok;
+}
+
+
+bool trace_direct_phase_batch( // Runs the direct-phase compute pipeline: uploads `samples` and `sources`, dispatches the shader, and copies one accumulator per sample back into `accum`. Returns true on success or when there is nothing to do.
+    const gpu_light::direct_phase_source_t *sources,
+    std::size_t source_count,
+    const gpu_light::direct_phase_sample_t *samples,
+    gpu_light::direct_phase_accum_t *accum,
+    std::size_t sample_count,
+    std::string &error) {
+    std::lock_guard<std::mutex> lock(g_mutex);
+    if (!g.device || !g.direct_pipeline || !g.tlas.as) {
+        error = "Vulkan GPU direct phase backend is not initialized";
+        return false;
+    }
+    if (g.has_filtered_embree_geometry) {
+        return false; // returns false without setting `error`
+    }
+    if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) {
+        return true; // nothing to do; `accum` is not written in this case
+    }
+
+    std::vector<gpu_direct_phase_sample_host_t> gpu_samples(sample_count); // repack the samples field by field into the host-side GPU layout
+    for (std::size_t i = 0; i < sample_count; ++i) {
+        gpu_samples[i].px = samples[i].px;
+        gpu_samples[i].py = samples[i].py;
+        gpu_samples[i].pz = samples[i].pz;
+        gpu_samples[i].occlusion = samples[i].occlusion;
+        gpu_samples[i].nx = samples[i].nx;
+        gpu_samples[i].ny = samples[i].ny;
+        gpu_samples[i].nz = samples[i].nz;
+        gpu_samples[i].twosided = samples[i].twosided;
+    }
+
+    std::vector<gpu_direct_phase_source_host_t> gpu_sources(source_count); // same repacking for the light sources
+    for (std::size_t i = 0; i < source_count; ++i) {
+        gpu_sources[i].px = sources[i].px;
+        gpu_sources[i].py = sources[i].py;
+        gpu_sources[i].pz = sources[i].pz;
+        gpu_sources[i].light = sources[i].light;
+        gpu_sources[i].dx = sources[i].dx;
+        gpu_sources[i].dy = sources[i].dy;
+        gpu_sources[i].dz = sources[i].dz;
+        gpu_sources[i].dist = sources[i].dist;
+        gpu_sources[i].cr = sources[i].cr;
+        gpu_sources[i].cg = sources[i].cg;
+        gpu_sources[i].cb = sources[i].cb;
+        gpu_sources[i].atten = sources[i].atten;
+        gpu_sources[i].type = sources[i].type;
+        gpu_sources[i].formula = sources[i].formula;
+        gpu_sources[i].flags = sources[i].flags;
+        gpu_sources[i].reserved0 = 0; // reserved/padding fields are zeroed rather than copied
+        gpu_sources[i].anglescale = sources[i].anglescale;
+        gpu_sources[i].dirt = sources[i].dirt;
+        gpu_sources[i].falloff = sources[i].falloff;
+        gpu_sources[i].pad0 = 0.0f;
+    }
+
+    std::vector<gpu_direct_accum_host_t> zero_accum(sample_count); // value-initialised contents used to seed the accumulator buffer
+
+    buffer_t sample_buffer;
+    buffer_t source_buffer;
+    buffer_t accum_buffer;
+
+    bool ok = create_buffer(sizeof(gpu_direct_phase_sample_host_t) * sample_count,
+        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        sample_buffer,
+        error,
+        gpu_samples.data());
+    if (!ok) return false;
+
+    ok = create_buffer(sizeof(gpu_direct_phase_source_host_t) * source_count,
+        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        source_buffer,
+        error,
+        gpu_sources.data());
+    if (!ok) {
+        destroy_buffer(sample_buffer); // each failure path releases the buffers created so far
+        return false;
+    }
+
+    ok = create_buffer(sizeof(gpu_direct_accum_host_t) * sample_count,
+        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        accum_buffer,
+        error,
+        zero_accum.data());
+    if (!ok) {
+        destroy_buffer(source_buffer);
+        destroy_buffer(sample_buffer);
+        return false;
+    }
+
+    update_direct_descriptor_set(sample_buffer, source_buffer, accum_buffer); // samples -> binding 1, sources -> binding 2, accumulators -> binding 3
+
+    direct_push_constants_t pc{};
+    pc.sample_count = static_cast<std::uint32_t>(sample_count); // NOTE(review): counts above 2^32-1 would be truncated by these casts
+    pc.source_count = static_cast<std::uint32_t>(source_count);
+    pc.flags = 0;
+    pc.reserved0 = 0;
+
+    ok = one_time_submit([&](VkCommandBuffer cmd) {
+        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.direct_pipeline);
+        vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.direct_pipeline_layout, 0, 1, &g.direct_descriptor_set, 0, nullptr);
+        vkCmdPushConstants(cmd, g.direct_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
+        vkCmdDispatch(cmd, (pc.sample_count + 63u) / 64u, 1, 1); // ceil(sample_count / 64) groups; 64 presumably matches the shader's local size - confirm against direct_phase.comp
+    }, error);
+
+    if (ok) {
+        void *mapped = nullptr;
+        ok = check(vkMapMemory(g.device, accum_buffer.memory, 0, accum_buffer.size, 0, &mapped), "vkMapMemory(direct phase accum)", error);
+        if (ok) {
+            const auto *gpu_accum = static_cast<const gpu_direct_accum_host_t *>(mapped);
+            for (std::size_t i = 0; i < sample_count; ++i) { // copy colour, normal and hit back; padding and reserved fields are zeroed
+                accum[i].cr = gpu_accum[i].cr;
+                accum[i].cg = gpu_accum[i].cg;
+                accum[i].cb = gpu_accum[i].cb;
+                accum[i].pad0 = 0.0f;
+                accum[i].nr = gpu_accum[i].nr;
+                accum[i].ng = gpu_accum[i].ng;
+                accum[i].nb = gpu_accum[i].nb;
+                accum[i].pad1 = 0.0f;
+                accum[i].hit = gpu_accum[i].hit;
+                accum[i].reserved0 = 0;
+                accum[i].reserved1 = 0;
+                accum[i].reserved2 = 0;
+            }
+            vkUnmapMemory(g.device, accum_buffer.memory);
+        }
+    }
+
+    destroy_buffer(accum_buffer); // all three per-batch buffers are released on success and failure alike
+    destroy_buffer(source_buffer);
+    destroy_buffer(sample_buffer);
+    return ok;
+}
+
+bool trace_direct_accumulate_batch( // Disabled legacy entry point: ignores every input, sets `error` and always returns false.
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const gpu_light::direct_job_t *jobs,
+    std::size_t job_count,
+    const gpu_light::direct_sample_range_t *ranges,
+    gpu_light::direct_accum_t *accum,
+    std::size_t sample_count,
+    std::string &error) {
+    (void)self; // the casts to void mark the parameters as intentionally unused
+    (void)shadow_mask;
+    (void)jobs;
+    (void)job_count;
+    (void)ranges;
+    (void)accum;
+    (void)sample_count;
+    error = "old direct job buffer path disabled in v5; use trace_direct_phase_batch";
+    return false;
+}
+
+} // namespace gpu_light::vulkan_backend
+
+#endif // HAVE_GPU_LIGHT

From dd771a8b174bfc5964dbbfa775f9e1d26a042068 Mon Sep 17 00:00:00 2001
From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com>
Date: Sat, 20 Jun 2026 03:57:28 +0900
Subject: [PATCH 2/7]  clean up comments

---
 include/light/trace_gpu.hh | 3 +--
 light/trace_gpu.cc         | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/light/trace_gpu.hh b/include/light/trace_gpu.hh
index a0ecfb51..f223bb90 100644
--- a/include/light/trace_gpu.hh
+++ b/include/light/trace_gpu.hh
@@ -130,11 +130,10 @@ bool trace_direct_accumulate_batch(
 
 } // namespace gpu_light
 
-// C-style wrappers are easier to call from older code paths.
 bool GPU_TraceInit(const mbsp_t *bsp);
 void GPU_TraceShutdown();
 bool GPU_TraceAvailable();
 const char *GPU_TraceLastError();
 
-// Implemented in light/ltface.cc by the v5 overlay; flushes pending sample-driven direct-light work.
+// Flushes pending sample-driven direct-light work.
 void GPU_DirectQueue_Flush(const mbsp_t *bsp);
diff --git a/light/trace_gpu.cc b/light/trace_gpu.cc
index e20a33fd..71a6e8e8 100644
--- a/light/trace_gpu.cc
+++ b/light/trace_gpu.cc
@@ -45,8 +45,7 @@ stats_t g_stats;
 } // namespace
 
 bool requested() {
-    // The apply script wires this to light_options.gpu in the call site. Keeping
-    // this function independent avoids pulling all light settings into this TU.
+    // Keeping this function independent avoids pulling all light settings into this TU.
     return true;
 }
 

From 10e154b9880ecc7978650a25354b17e5c8cf8bd5 Mon Sep 17 00:00:00 2001
From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com>
Date: Sat, 20 Jun 2026 04:19:02 +0900
Subject: [PATCH 3/7] Enhance GPU direct phase processing with additional
 structures and Vulkan integration

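- Added `face_index` and reserved fields to `direct_phase_sample_t` and `GpuDirectPhaseSample` for better tracking of face data.
- Introduced `direct_phase_face_range_t` and `GpuDirectPhaseFaceRange` structures to manage source ranges.
- Updated `trace_direct_phase_batch` function signatures to include new parameters for face ranges and source indices.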
- Modified Vulkan descriptor set and buffer updates to accommodate new data structures.
- Improved shader code to utilize face range data for more efficient light processing.
---
 include/light/trace_gpu.hh          |  15 +-
 light/gpu_shaders/direct_phase.comp |  60 ++++---
 light/ltface.cc                     | 240 +++++++++++++++++++++++++---
 light/trace_gpu.cc                  |  24 ++-
 light/trace_gpu_vulkan.cc           | 143 ++++++++++++++---
 5 files changed, 418 insertions(+), 64 deletions(-)

diff --git a/include/light/trace_gpu.hh b/include/light/trace_gpu.hh
index f223bb90..96ac2f85 100644
--- a/include/light/trace_gpu.hh
+++ b/include/light/trace_gpu.hh
@@ -61,6 +61,15 @@ struct direct_accum_t {
 struct direct_phase_sample_t {
     float px = 0, py = 0, pz = 0, occlusion = 1;
     float nx = 0, ny = 0, nz = 1, twosided = 0;
+    std::uint32_t face_index = 0;
+    std::uint32_t reserved0 = 0;
+    std::uint32_t reserved1 = 0;
+    std::uint32_t reserved2 = 0;
+};
+
+struct direct_phase_face_range_t { // A (begin, count) pair; presumably one per face, selecting a run of entries in face_source_indices - confirm against the caller in ltface.cc
+    std::uint32_t source_begin = 0; // index of the first entry of the run
+    std::uint32_t source_count = 0; // number of entries in the run
 };
 
 struct direct_phase_source_t {
@@ -117,7 +126,11 @@ bool trace_direct_phase_batch(
     std::size_t source_count,
     const direct_phase_sample_t *samples,
     direct_phase_accum_t *accum,
-    std::size_t sample_count);
+    std::size_t sample_count,
+    const direct_phase_face_range_t *face_ranges,
+    std::size_t face_range_count,
+    const std::uint32_t *face_source_indices,
+    std::size_t face_source_index_count);
 
 bool trace_direct_accumulate_batch(
     const modelinfo_t *self,
diff --git a/light/gpu_shaders/direct_phase.comp b/light/gpu_shaders/direct_phase.comp
index 663ba73f..43dd6b51 100644
--- a/light/gpu_shaders/direct_phase.comp
+++ b/light/gpu_shaders/direct_phase.comp
@@ -9,6 +9,10 @@ struct GpuDirectPhaseSample {
     float occlusion;
     vec3 normal;
     float twosided;
+    uint faceIndex;
+    uint reserved0;
+    uint reserved1;
+    uint reserved2;
 };
 
 struct GpuDirectPhaseSource {
@@ -28,6 +32,11 @@ struct GpuDirectPhaseSource {
     float pad0;
 };
 
+struct GpuDirectPhaseFaceRange { // element type of faceRangeBuffer.ranges (binding 3); two uints, presumably mirroring the host-side direct_phase_face_range_t - confirm
+    uint sourceBegin; // first entry of the run
+    uint sourceCount; // number of entries in the run
+};
+
 struct GpuDirectAccum {
     vec3 color;
     float pad0;
@@ -42,7 +51,9 @@ struct GpuDirectAccum {
 layout(set = 0, binding = 0) uniform accelerationStructureEXT sceneAS;
 layout(std430, set = 0, binding = 1) readonly buffer Samples { GpuDirectPhaseSample samples[]; } sampleBuffer;
 layout(std430, set = 0, binding = 2) readonly buffer Sources { GpuDirectPhaseSource sources[]; } sourceBuffer;
-layout(std430, set = 0, binding = 3) writeonly buffer Accum { GpuDirectAccum accum[]; } accumBuffer;
+layout(std430, set = 0, binding = 3) readonly buffer FaceRanges { GpuDirectPhaseFaceRange ranges[]; } faceRangeBuffer;
+layout(std430, set = 0, binding = 4) readonly buffer FaceSourceIndices { uint indices[]; } faceSourceIndexBuffer;
+layout(std430, set = 0, binding = 5) writeonly buffer Accum { GpuDirectAccum accum[]; } accumBuffer;
 
 layout(push_constant) uniform PushConstants {
     uint sampleCount;
@@ -72,30 +83,37 @@ float point_light_value(uint formula, float light, float atten, float dist, floa
     float d = max(dist, 1.0);
     float a = max(atten, 0.0001);
 
-    // Mirrors the broad ericw-tools delay/formula families well enough for the
-    // experimental GPU fast path. Exact exotic cases should stay on CPU.
-    if (formula == 1u) {              // LF_INVERSE
+    if (formula == 1u) {
         return light * 128.0 / (d * a);
-    } else if (formula == 2u) {       // LF_INVERSE2
+    } else if (formula == 2u) {
         return light * 128.0 * 128.0 / (d * d * a);
-    } else if (formula == 3u) {       // LF_INFINITE
+    } else if (formula == 3u) {
         return light;
-    } else if (formula == 5u) {       // LF_INVERSE2A
+    } else if (formula == 5u) {
         float da = d + 128.0;
         return light * 128.0 * 128.0 / (da * da * a);
-    } else if (formula == 6u) {       // LF_QRAD3-ish
+    } else if (formula == 6u) {
         float qd = max(d, 16.0);
         return light * 128.0 * 128.0 / (qd * qd * a);
     }
 
-    // LF_LINEAR. If _falloff is set, use it as the zero point. Otherwise the
-    // classic formula is light - distance * attenuation.
     if (falloff > 0.0) {
         return light * max(0.0, 1.0 - d / falloff);
     }
     return light - d * a;
 }
 
+void clear_accum(uint sample_id) {
+    accumBuffer.accum[sample_id].color = vec3(0.0);
+    accumBuffer.accum[sample_id].pad0 = 0.0;
+    accumBuffer.accum[sample_id].normal = vec3(0.0);
+    accumBuffer.accum[sample_id].pad1 = 0.0;
+    accumBuffer.accum[sample_id].hit = 0u;
+    accumBuffer.accum[sample_id].reserved0 = 0u;
+    accumBuffer.accum[sample_id].reserved1 = 0u;
+    accumBuffer.accum[sample_id].reserved2 = 0u;
+}
+
 void main() {
     uint sample_id = gl_GlobalInvocationID.x;
     if (sample_id >= pc.sampleCount) {
@@ -104,21 +122,25 @@ void main() {
 
     GpuDirectPhaseSample s = sampleBuffer.samples[sample_id];
     if (s.twosided < -0.5) {
-        accumBuffer.accum[sample_id].color = vec3(0.0);
-        accumBuffer.accum[sample_id].pad0 = 0.0;
-        accumBuffer.accum[sample_id].normal = vec3(0.0);
-        accumBuffer.accum[sample_id].pad1 = 0.0;
-        accumBuffer.accum[sample_id].hit = 0u;
-        accumBuffer.accum[sample_id].reserved0 = 0u;
-        accumBuffer.accum[sample_id].reserved1 = 0u;
-        accumBuffer.accum[sample_id].reserved2 = 0u;
+        clear_accum(sample_id);
         return;
     }
+
+    GpuDirectPhaseFaceRange r = faceRangeBuffer.ranges[s.faceIndex];
+    if (r.sourceCount == 0u) {
+        clear_accum(sample_id);
+        return;
+    }
+
     vec3 total_color = vec3(0.0);
     vec3 total_normal = vec3(0.0);
     uint any_hit = 0u;
 
-    for (uint source_id = 0u; source_id < pc.sourceCount; ++source_id) {
+    for (uint local_i = 0u; local_i < r.sourceCount; ++local_i) {
+        uint source_id = faceSourceIndexBuffer.indices[r.sourceBegin + local_i];
+        if (source_id >= pc.sourceCount) {
+            continue;
+        }
         GpuDirectPhaseSource l = sourceBuffer.sources[source_id];
 
         vec3 ray_dir;
diff --git a/light/ltface.cc b/light/ltface.cc
index fc7c0197..baddf249 100644
--- a/light/ltface.cc
+++ b/light/ltface.cc
@@ -18,6 +18,7 @@
 */
 
 #include <light/ltface.hh>
+#include <limits>
 #include <chrono>
 #include <cstdint>
 #include <mutex>
@@ -2565,14 +2566,7 @@ lightsurf_t CreateLightmapSurface(const mbsp_t *bsp, const mface_t *face, const
 #if defined(HAVE_GPU_LIGHT)
 static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps)
 {
-    // v5 disabled: per-face GPU direct was slower than Embree.
-    return false;
-
-    // v4 disabled: per-face GPU direct was slower than Embree.
-    return false;
 
-    // Disabled: this per-face GPU direct path is currently slower than Embree.
-    // It is not the final whole-phase batching architecture.
     return false;
 
     if (!GPU_TraceAvailable()) {
@@ -2757,6 +2751,8 @@ static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, light
 
 
 
+
+
 #if defined(HAVE_GPU_LIGHT)
 namespace {
 struct gpu_direct_face_record_t {
@@ -2769,12 +2765,154 @@ struct gpu_direct_face_record_t {
 std::mutex g_gpu_direct_queue_mutex;
 std::vector<gpu_light::direct_phase_sample_t> g_gpu_direct_samples;
 std::vector<gpu_light::direct_phase_source_t> g_gpu_direct_sources;
+std::vector<gpu_light::direct_phase_face_range_t> g_gpu_direct_face_ranges;
+std::vector<std::uint32_t> g_gpu_direct_face_source_indices;
 std::vector<gpu_direct_face_record_t> g_gpu_direct_faces;
 bool g_gpu_direct_sources_built = false;
 bool g_gpu_direct_disabled = false;
 
 static constexpr std::size_t GPU_DIRECT_FLUSH_SAMPLES = 1024ull * 1024ull;
 
+struct gpu_direct_source_key_t {
+    std::uint32_t type = 0;
+    std::uint32_t formula = 0;
+    std::uint32_t flags = 0;
+    int px = 0, py = 0, pz = 0;
+    int dx = 0, dy = 0, dz = 0;
+    int cr = 0, cg = 0, cb = 0;
+    int light = 0, atten = 0, anglescale = 0, falloff = 0;
+};
+
+static int GPU_Direct_Quantize(float v, float scale = 4096.0f)
+{
+    return static_cast<int>(std::lround(v * scale));
+}
+
+static gpu_direct_source_key_t GPU_Direct_SourceKey(const gpu_light::direct_phase_source_t &s)
+{
+    gpu_direct_source_key_t k{};
+    k.type = s.type;
+    k.formula = s.formula;
+    k.flags = s.flags;
+    k.px = GPU_Direct_Quantize(s.px);
+    k.py = GPU_Direct_Quantize(s.py);
+    k.pz = GPU_Direct_Quantize(s.pz);
+    k.dx = GPU_Direct_Quantize(s.dx);
+    k.dy = GPU_Direct_Quantize(s.dy);
+    k.dz = GPU_Direct_Quantize(s.dz);
+    k.cr = GPU_Direct_Quantize(s.cr);
+    k.cg = GPU_Direct_Quantize(s.cg);
+    k.cb = GPU_Direct_Quantize(s.cb);
+    k.light = GPU_Direct_Quantize(s.light, 1024.0f);
+    k.atten = GPU_Direct_Quantize(s.atten, 1024.0f);
+    k.anglescale = GPU_Direct_Quantize(s.anglescale, 1024.0f);
+    k.falloff = GPU_Direct_Quantize(s.falloff, 1024.0f);
+    return k;
+}
+
+static bool GPU_Direct_SourceKeyEquals(const gpu_direct_source_key_t &a, const gpu_direct_source_key_t &b)
+{
+    return a.type == b.type && a.formula == b.formula && a.flags == b.flags &&
+        a.px == b.px && a.py == b.py && a.pz == b.pz &&
+        a.dx == b.dx && a.dy == b.dy && a.dz == b.dz &&
+        a.cr == b.cr && a.cg == b.cg && a.cb == b.cb &&
+        a.light == b.light && a.atten == b.atten &&
+        a.anglescale == b.anglescale && a.falloff == b.falloff;
+}
+
+static void GPU_Direct_AddUniqueSource(
+    std::vector<gpu_direct_source_key_t> &keys,
+    const gpu_light::direct_phase_source_t &src)
+{
+    const auto key = GPU_Direct_SourceKey(src);
+    for (const auto &existing : keys) {
+        if (GPU_Direct_SourceKeyEquals(existing, key)) {
+            return;
+        }
+    }
+    keys.push_back(key);
+    g_gpu_direct_sources.push_back(src);
+}
+
+static float GPU_Direct_EffectivePointRadius(const gpu_light::direct_phase_source_t &src)
+{
+    if (src.type == 1) {
+        return MAX_SKY_DIST;
+    }
+    if (src.formula == 3u) { // LF_INFINITE
+        return MAX_SKY_DIST;
+    }
+    if (src.falloff > 0.0f) {
+        return std::min(src.falloff, static_cast<float>(MAX_SKY_DIST));
+    }
+    // Conservative only for LF_LINEAR/default: value = light - distance * atten.
+    // Inverse formulas are treated as global unless they provide _falloff.
+    if (src.formula == 0u && src.atten > 0.0001f && src.light > 0.0f) {
+        return std::min(src.light / src.atten, static_cast<float>(MAX_SKY_DIST));
+    }
+    return MAX_SKY_DIST;
+}
+
+static float GPU_Direct_PointAABBDistance2(
+    const gpu_light::direct_phase_source_t &src,
+    const qvec3f &mins,
+    const qvec3f &maxs)
+{
+    const float p[3] = {src.px, src.py, src.pz};
+    float d2 = 0.0f;
+    for (int axis = 0; axis < 3; ++axis) {
+        if (p[axis] < mins[axis]) {
+            const float d = mins[axis] - p[axis];
+            d2 += d * d;
+        } else if (p[axis] > maxs[axis]) {
+            const float d = p[axis] - maxs[axis];
+            d2 += d * d;
+        }
+    }
+    return d2;
+}
+
+static bool GPU_Direct_SourceAffectsFace(
+    const gpu_light::direct_phase_source_t &src,
+    const qvec3f &mins,
+    const qvec3f &maxs,
+    const qvec3f &normal,
+    bool twosided)
+{
+    if (twosided) {
+        return true;
+    }
+
+    if (src.type == 1) {
+        const qvec3f dir{src.dx, src.dy, src.dz};
+        return qv::dot(normal, dir) > -0.05f;
+    }
+
+    const float radius = GPU_Direct_EffectivePointRadius(src);
+    if (radius < static_cast<float>(MAX_SKY_DIST) * 0.999f) {
+        const float d2 = GPU_Direct_PointAABBDistance2(src, mins, maxs);
+        if (d2 > radius * radius) {
+            return false;
+        }
+    }
+
+    // Conservative face-normal cull for point lights: use vector from face center to light.
+    const qvec3f center{
+        (mins[0] + maxs[0]) * 0.5f,
+        (mins[1] + maxs[1]) * 0.5f,
+        (mins[2] + maxs[2]) * 0.5f};
+    qvec3f to_light{src.px - center[0], src.py - center[1], src.pz - center[2]};
+    const float to_light_len2 = qv::dot(to_light, to_light);
+    if (to_light_len2 > 0.0001f) {
+        to_light = to_light * (1.0f / std::sqrt(to_light_len2));
+        if (qv::dot(normal, to_light) <= -0.10f) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static bool GPU_DirectQueue_BuildSourcesLocked()
 {
     if (g_gpu_direct_sources_built) {
@@ -2782,7 +2920,9 @@ static bool GPU_DirectQueue_BuildSourcesLocked()
     }
     g_gpu_direct_sources_built = true;
     g_gpu_direct_sources.clear();
+    std::vector<gpu_direct_source_key_t> unique_keys;
 
+    std::size_t raw_sources = 0;
     for (const auto &entity_ptr : GetLights()) {
         const light_t *entity = entity_ptr.get();
         if (entity->nostaticlight.value()) continue;
@@ -2813,7 +2953,8 @@ static bool GPU_DirectQueue_BuildSourcesLocked()
         src.anglescale = entity->anglescale.value();
         src.dirt = entity->dirt.value();
         src.falloff = entity->falloff.value();
-        g_gpu_direct_sources.push_back(src);
+        ++raw_sources;
+        GPU_Direct_AddUniqueSource(unique_keys, src);
     }
 
     for (const sun_t &sun : GetSuns()) {
@@ -2835,20 +2976,33 @@ static bool GPU_DirectQueue_BuildSourcesLocked()
         src.flags = sun.dirt ? 1u : 0u;
         src.anglescale = sun.anglescale;
         src.dirt = sun.dirt ? 1.0f : 0.0f;
-        g_gpu_direct_sources.push_back(src);
+        ++raw_sources;
+        GPU_Direct_AddUniqueSource(unique_keys, src);
     }
 
-    logging::print("GPU direct phase: queued {} compatible direct sources.\n", g_gpu_direct_sources.size());
-    if (g_gpu_direct_sources.empty()) {
-        return true;
-    }
+    logging::print("GPU direct phase: queued {} compatible direct sources ({} raw, {} deduped).\n",
+        g_gpu_direct_sources.size(), raw_sources, raw_sources - g_gpu_direct_sources.size());
     return true;
 }
 
+static std::uint64_t GPU_DirectQueue_ImplicitRayCountLocked()
+{
+    std::uint64_t implicit_rays = 0;
+    for (const auto &sample : g_gpu_direct_samples) {
+        const std::size_t face_index = sample.face_index;
+        if (face_index < g_gpu_direct_face_ranges.size()) {
+            implicit_rays += g_gpu_direct_face_ranges[face_index].source_count;
+        }
+    }
+    return implicit_rays;
+}
+
 static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp)
 {
     if (g_gpu_direct_samples.empty()) {
         g_gpu_direct_faces.clear();
+        g_gpu_direct_face_ranges.clear();
+        g_gpu_direct_face_source_indices.clear();
         return true;
     }
 
@@ -2859,7 +3013,11 @@ static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp)
         g_gpu_direct_sources.size(),
         g_gpu_direct_samples.data(),
         accum.data(),
-        g_gpu_direct_samples.size());
+        g_gpu_direct_samples.size(),
+        g_gpu_direct_face_ranges.data(),
+        g_gpu_direct_face_ranges.size(),
+        g_gpu_direct_face_source_indices.data(),
+        g_gpu_direct_face_source_indices.size());
     const auto t1 = std::chrono::steady_clock::now();
     const double gpu_ms = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / 1000.0;
 
@@ -2869,6 +3027,8 @@ static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp)
         logging::print("ERROR: disabling GPU direct phase for the rest of this run. Re-run without -gpu for guaranteed CPU output.\n");
         g_gpu_direct_samples.clear();
         g_gpu_direct_faces.clear();
+        g_gpu_direct_face_ranges.clear();
+        g_gpu_direct_face_source_indices.clear();
         return false;
     }
 
@@ -2894,12 +3054,14 @@ static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp)
         }
     }
 
-    const std::uint64_t implicit_rays = static_cast<std::uint64_t>(g_gpu_direct_samples.size()) * static_cast<std::uint64_t>(g_gpu_direct_sources.size());
-    logging::print("GPU direct phase: flushed {} samples x {} sources = {} implicit rays in {:.3f} ms\n",
-        g_gpu_direct_samples.size(), g_gpu_direct_sources.size(), implicit_rays, gpu_ms);
+    const std::uint64_t implicit_rays = GPU_DirectQueue_ImplicitRayCountLocked();
+    logging::print("GPU direct phase: flushed {} samples, {} unique sources, {} face-source refs = {} implicit rays in {:.3f} ms\n",
+        g_gpu_direct_samples.size(), g_gpu_direct_sources.size(), g_gpu_direct_face_source_indices.size(), implicit_rays, gpu_ms);
 
     g_gpu_direct_samples.clear();
     g_gpu_direct_faces.clear();
+    g_gpu_direct_face_ranges.clear();
+    g_gpu_direct_face_source_indices.clear();
     return true;
 }
 } // namespace
@@ -2931,18 +3093,60 @@ static bool GPU_DirectQueue_AddFace(const mbsp_t *bsp, lightsurf_t *lightsurf, l
         return true;
     }
 
+    qvec3f mins{std::numeric_limits<float>::max(), std::numeric_limits<float>::max(), std::numeric_limits<float>::max()};
+    qvec3f maxs{-std::numeric_limits<float>::max(), -std::numeric_limits<float>::max(), -std::numeric_limits<float>::max()};
+    qvec3f normal_sum{0, 0, 0};
+    std::size_t valid_samples = 0;
+    for (const auto &sample : lightsurf->samples) {
+        if (sample.occluded) {
+            continue;
+        }
+        for (int axis = 0; axis < 3; ++axis) {
+            mins[axis] = std::min(mins[axis], sample.point[axis]);
+            maxs[axis] = std::max(maxs[axis], sample.point[axis]);
+        }
+        normal_sum += sample.normal;
+        ++valid_samples;
+    }
+    if (valid_samples == 0) {
+        return true;
+    }
+
+    qvec3f face_normal = lightsurf->snormal;
+    const float normal_len2 = qv::dot(normal_sum, normal_sum);
+    if (normal_len2 > 0.0001f) {
+        face_normal = normal_sum * (1.0f / std::sqrt(normal_len2));
+    }
+
+    const std::uint32_t face_index = static_cast<std::uint32_t>(g_gpu_direct_face_ranges.size());
+    gpu_light::direct_phase_face_range_t face_range{};
+    face_range.source_begin = static_cast<std::uint32_t>(g_gpu_direct_face_source_indices.size());
+
+    for (std::uint32_t source_index = 0; source_index < g_gpu_direct_sources.size(); ++source_index) {
+        if (GPU_Direct_SourceAffectsFace(g_gpu_direct_sources[source_index], mins, maxs, face_normal, lightsurf->twosided)) {
+            g_gpu_direct_face_source_indices.push_back(source_index);
+        }
+    }
+
+    face_range.source_count = static_cast<std::uint32_t>(g_gpu_direct_face_source_indices.size()) - face_range.source_begin;
+    if (face_range.source_count == 0) {
+        return true;
+    }
+    g_gpu_direct_face_ranges.push_back(face_range);
+
     const std::size_t first_sample = g_gpu_direct_samples.size();
     g_gpu_direct_faces.push_back(gpu_direct_face_record_t{lightsurf, lightmaps, first_sample, sample_count});
 
     for (const auto &sample : lightsurf->samples) {
         gpu_light::direct_phase_sample_t s{};
+        s.face_index = face_index;
         if (!sample.occluded) {
             s.px = sample.point[0]; s.py = sample.point[1]; s.pz = sample.point[2];
             s.nx = sample.normal[0]; s.ny = sample.normal[1]; s.nz = sample.normal[2];
             s.occlusion = sample.occlusion;
             s.twosided = lightsurf->twosided ? 1.0f : 0.0f;
         } else {
-            s.twosided = -1.0f; // sentinel: shader skips occluded/invalid samples
+            s.twosided = -1.0f;
         }
         g_gpu_direct_samples.push_back(s);
     }
diff --git a/light/trace_gpu.cc b/light/trace_gpu.cc
index 71a6e8e8..1f5a1732 100644
--- a/light/trace_gpu.cc
+++ b/light/trace_gpu.cc
@@ -22,6 +22,10 @@ bool trace_direct_phase_batch(
     const gpu_light::direct_phase_sample_t *samples,
     gpu_light::direct_phase_accum_t *accum,
     std::size_t sample_count,
+    const gpu_light::direct_phase_face_range_t *face_ranges,
+    std::size_t face_range_count,
+    const std::uint32_t *face_source_indices,
+    std::size_t face_source_index_count,
     std::string &error);
 
 bool trace_direct_accumulate_batch(
@@ -145,12 +149,23 @@ bool trace_direct_phase_batch(
     std::size_t source_count,
     const direct_phase_sample_t *samples,
     direct_phase_accum_t *accum,
-    std::size_t sample_count) {
-    if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) {
+    std::size_t sample_count,
+    const direct_phase_face_range_t *face_ranges,
+    std::size_t face_range_count,
+    const std::uint32_t *face_source_indices,
+    std::size_t face_source_index_count) {
+    if (!sources || !samples || !accum || !face_ranges || !face_source_indices || source_count == 0 || sample_count == 0 || face_range_count == 0) {
         return true;
     }
 
-    const std::uint64_t implicit_rays = static_cast<std::uint64_t>(source_count) * static_cast<std::uint64_t>(sample_count);
+    std::uint64_t implicit_rays = 0;
+    for (std::size_t i = 0; i < face_range_count; ++i) {
+        implicit_rays += face_ranges[i].source_count;
+    }
+    if (implicit_rays == 0 || face_source_index_count == 0) {
+        return true;
+    }
+    implicit_rays *= static_cast<std::uint64_t>(sample_count) / static_cast<std::uint64_t>(face_range_count);
     {
         std::lock_guard<std::mutex> lock(g_mutex);
         g_stats.batches++;
@@ -164,7 +179,8 @@ bool trace_direct_phase_batch(
 #if defined(HAVE_GPU_LIGHT)
     std::string error;
     const bool ok = vulkan_backend::trace_direct_phase_batch(
-        sources, source_count, samples, accum, sample_count, error);
+        sources, source_count, samples, accum, sample_count,
+        face_ranges, face_range_count, face_source_indices, face_source_index_count, error);
     std::lock_guard<std::mutex> lock(g_mutex);
     if (ok) {
         g_stats.gpu_batches++;
diff --git a/light/trace_gpu_vulkan.cc b/light/trace_gpu_vulkan.cc
index eda4d928..da78638e 100644
--- a/light/trace_gpu_vulkan.cc
+++ b/light/trace_gpu_vulkan.cc
@@ -90,6 +90,15 @@ struct gpu_direct_accum_host_t {
 struct gpu_direct_phase_sample_host_t {
     float px, py, pz, occlusion;
     float nx, ny, nz, twosided;
+    std::uint32_t face_index;
+    std::uint32_t reserved0;
+    std::uint32_t reserved1;
+    std::uint32_t reserved2;
+};
+
+struct gpu_direct_phase_face_range_host_t {
+    std::uint32_t source_begin;
+    std::uint32_t source_count;
 };
 
 struct gpu_direct_phase_source_host_t {
@@ -122,7 +131,8 @@ static_assert(sizeof(gpu_result_host_t) == 20, "GPU result layout must match sha
 static_assert(sizeof(gpu_direct_job_host_t) == 80, "GPU direct job layout must match shader");
 static_assert(sizeof(gpu_direct_range_host_t) == 8, "GPU direct range layout must match shader");
 static_assert(sizeof(gpu_direct_accum_host_t) == 48, "GPU direct accum layout must match shader");
-static_assert(sizeof(gpu_direct_phase_sample_host_t) == 32, "GPU direct phase sample layout must match shader");
+static_assert(sizeof(gpu_direct_phase_sample_host_t) == 48, "GPU direct phase sample layout must match shader");
+static_assert(sizeof(gpu_direct_phase_face_range_host_t) == 8, "GPU direct phase face range layout must match shader");
 static_assert(sizeof(gpu_direct_phase_source_host_t) == 80, "GPU direct phase source layout must match shader");
 
 struct context_t {
@@ -793,7 +803,19 @@ static bool create_direct_pipeline(std::string &error) {
     b3.descriptorCount = 1;
     b3.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
 
-    std::array<VkDescriptorSetLayoutBinding, 4> bindings{b0, b1, b2, b3};
+    VkDescriptorSetLayoutBinding b4{};
+    b4.binding = 4;
+    b4.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    b4.descriptorCount = 1;
+    b4.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkDescriptorSetLayoutBinding b5{};
+    b5.binding = 5;
+    b5.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    b5.descriptorCount = 1;
+    b5.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    std::array<VkDescriptorSetLayoutBinding, 6> bindings{b0, b1, b2, b3, b4, b5};
     VkDescriptorSetLayoutCreateInfo dlci{};
     dlci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
     dlci.bindingCount = static_cast<std::uint32_t>(bindings.size());
@@ -840,7 +862,7 @@ static bool create_direct_pipeline(std::string &error) {
     ps0.descriptorCount = 1;
     VkDescriptorPoolSize ps1{};
     ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-    ps1.descriptorCount = 3;
+    ps1.descriptorCount = 5;
     std::array<VkDescriptorPoolSize, 2> sizes{ps0, ps1};
 
     VkDescriptorPoolCreateInfo dpci{};
@@ -860,7 +882,12 @@ static bool create_direct_pipeline(std::string &error) {
     return true;
 }
 
-static void update_direct_descriptor_set(const buffer_t &job_buffer, const buffer_t &range_buffer, const buffer_t &accum_buffer) {
+static void update_direct_descriptor_set(
+    const buffer_t &sample_buffer,
+    const buffer_t &source_buffer,
+    const buffer_t &face_range_buffer,
+    const buffer_t &face_source_index_buffer,
+    const buffer_t &accum_buffer) {
     VkWriteDescriptorSetAccelerationStructureKHR as_info{};
     as_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR;
     as_info.accelerationStructureCount = 1;
@@ -874,43 +901,67 @@ static void update_direct_descriptor_set(const buffer_t &job_buffer, const buffe
     w0.descriptorCount = 1;
     w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR;
 
-    VkDescriptorBufferInfo job_info{};
-    job_info.buffer = job_buffer.buffer;
-    job_info.offset = 0;
-    job_info.range = job_buffer.size;
+    VkDescriptorBufferInfo sample_info{};
+    sample_info.buffer = sample_buffer.buffer;
+    sample_info.offset = 0;
+    sample_info.range = sample_buffer.size;
     VkWriteDescriptorSet w1{};
     w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     w1.dstSet = g.direct_descriptor_set;
     w1.dstBinding = 1;
     w1.descriptorCount = 1;
     w1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-    w1.pBufferInfo = &job_info;
+    w1.pBufferInfo = &sample_info;
 
-    VkDescriptorBufferInfo range_info{};
-    range_info.buffer = range_buffer.buffer;
-    range_info.offset = 0;
-    range_info.range = range_buffer.size;
+    VkDescriptorBufferInfo source_info{};
+    source_info.buffer = source_buffer.buffer;
+    source_info.offset = 0;
+    source_info.range = source_buffer.size;
     VkWriteDescriptorSet w2{};
     w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     w2.dstSet = g.direct_descriptor_set;
     w2.dstBinding = 2;
     w2.descriptorCount = 1;
     w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-    w2.pBufferInfo = &range_info;
+    w2.pBufferInfo = &source_info;
 
-    VkDescriptorBufferInfo accum_info{};
-    accum_info.buffer = accum_buffer.buffer;
-    accum_info.offset = 0;
-    accum_info.range = accum_buffer.size;
+    VkDescriptorBufferInfo face_range_info{};
+    face_range_info.buffer = face_range_buffer.buffer;
+    face_range_info.offset = 0;
+    face_range_info.range = face_range_buffer.size;
     VkWriteDescriptorSet w3{};
     w3.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
     w3.dstSet = g.direct_descriptor_set;
     w3.dstBinding = 3;
     w3.descriptorCount = 1;
     w3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
-    w3.pBufferInfo = &accum_info;
+    w3.pBufferInfo = &face_range_info;
+
+    VkDescriptorBufferInfo face_source_index_info{};
+    face_source_index_info.buffer = face_source_index_buffer.buffer;
+    face_source_index_info.offset = 0;
+    face_source_index_info.range = face_source_index_buffer.size;
+    VkWriteDescriptorSet w4{};
+    w4.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w4.dstSet = g.direct_descriptor_set;
+    w4.dstBinding = 4;
+    w4.descriptorCount = 1;
+    w4.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    w4.pBufferInfo = &face_source_index_info;
 
-    std::array<VkWriteDescriptorSet, 4> writes{w0, w1, w2, w3};
+    VkDescriptorBufferInfo accum_info{};
+    accum_info.buffer = accum_buffer.buffer;
+    accum_info.offset = 0;
+    accum_info.range = accum_buffer.size;
+    VkWriteDescriptorSet w5{};
+    w5.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    w5.dstSet = g.direct_descriptor_set;
+    w5.dstBinding = 5;
+    w5.descriptorCount = 1;
+    w5.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    w5.pBufferInfo = &accum_info;
+
+    std::array<VkWriteDescriptorSet, 6> writes{w0, w1, w2, w3, w4, w5};
     vkUpdateDescriptorSets(g.device, static_cast<std::uint32_t>(writes.size()), writes.data(), 0, nullptr);
 }
 
@@ -1089,6 +1140,10 @@ bool trace_direct_phase_batch(
     const gpu_light::direct_phase_sample_t *samples,
     gpu_light::direct_phase_accum_t *accum,
     std::size_t sample_count,
+    const gpu_light::direct_phase_face_range_t *face_ranges,
+    std::size_t face_range_count,
+    const std::uint32_t *face_source_indices,
+    std::size_t face_source_index_count,
     std::string &error) {
     std::lock_guard<std::mutex> lock(g_mutex);
     if (!g.device || !g.direct_pipeline || !g.tlas.as) {
@@ -1098,7 +1153,7 @@ bool trace_direct_phase_batch(
     if (g.has_filtered_embree_geometry) {
         return false;
     }
-    if (!sources || !samples || !accum || source_count == 0 || sample_count == 0) {
+    if (!sources || !samples || !accum || !face_ranges || !face_source_indices || source_count == 0 || sample_count == 0 || face_range_count == 0 || face_source_index_count == 0) {
         return true;
     }
 
@@ -1112,6 +1167,10 @@ bool trace_direct_phase_batch(
         gpu_samples[i].ny = samples[i].ny;
         gpu_samples[i].nz = samples[i].nz;
         gpu_samples[i].twosided = samples[i].twosided;
+        gpu_samples[i].face_index = samples[i].face_index;
+        gpu_samples[i].reserved0 = 0;
+        gpu_samples[i].reserved1 = 0;
+        gpu_samples[i].reserved2 = 0;
     }
 
     std::vector<gpu_direct_phase_source_host_t> gpu_sources(source_count);
@@ -1138,10 +1197,21 @@ bool trace_direct_phase_batch(
         gpu_sources[i].pad0 = 0.0f;
     }
 
+    std::vector<gpu_direct_phase_face_range_host_t> gpu_face_ranges(face_range_count);
+    for (std::size_t i = 0; i < face_range_count; ++i) {
+        gpu_face_ranges[i].source_begin = face_ranges[i].source_begin;
+        gpu_face_ranges[i].source_count = face_ranges[i].source_count;
+    }
+
+    std::vector<std::uint32_t> gpu_face_source_indices(face_source_index_count);
+    std::memcpy(gpu_face_source_indices.data(), face_source_indices, sizeof(std::uint32_t) * face_source_index_count);
+
     std::vector<gpu_direct_accum_host_t> zero_accum(sample_count);
 
     buffer_t sample_buffer;
     buffer_t source_buffer;
+    buffer_t face_range_buffer;
+    buffer_t face_source_index_buffer;
     buffer_t accum_buffer;
 
     bool ok = create_buffer(sizeof(gpu_direct_phase_sample_host_t) * sample_count,
@@ -1163,6 +1233,31 @@ bool trace_direct_phase_batch(
         return false;
     }
 
+    ok = create_buffer(sizeof(gpu_direct_phase_face_range_host_t) * face_range_count,
+        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        face_range_buffer,
+        error,
+        gpu_face_ranges.data());
+    if (!ok) {
+        destroy_buffer(source_buffer);
+        destroy_buffer(sample_buffer);
+        return false;
+    }
+
+    ok = create_buffer(sizeof(std::uint32_t) * face_source_index_count,
+        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        face_source_index_buffer,
+        error,
+        gpu_face_source_indices.data());
+    if (!ok) {
+        destroy_buffer(face_range_buffer);
+        destroy_buffer(source_buffer);
+        destroy_buffer(sample_buffer);
+        return false;
+    }
+
     ok = create_buffer(sizeof(gpu_direct_accum_host_t) * sample_count,
         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
@@ -1170,12 +1265,14 @@ bool trace_direct_phase_batch(
         error,
         zero_accum.data());
     if (!ok) {
+        destroy_buffer(face_source_index_buffer);
+        destroy_buffer(face_range_buffer);
         destroy_buffer(source_buffer);
         destroy_buffer(sample_buffer);
         return false;
     }
 
-    update_direct_descriptor_set(sample_buffer, source_buffer, accum_buffer);
+    update_direct_descriptor_set(sample_buffer, source_buffer, face_range_buffer, face_source_index_buffer, accum_buffer);
 
     direct_push_constants_t pc{};
     pc.sample_count = static_cast<std::uint32_t>(sample_count);
@@ -1214,6 +1311,8 @@ bool trace_direct_phase_batch(
     }
 
     destroy_buffer(accum_buffer);
+    destroy_buffer(face_source_index_buffer);
+    destroy_buffer(face_range_buffer);
     destroy_buffer(source_buffer);
     destroy_buffer(sample_buffer);
     return ok;

From 0d9f25ed79ba375a6e8c85cddeb8c0ad52bcdfde Mon Sep 17 00:00:00 2001
From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com>
Date: Sat, 20 Jun 2026 04:19:25 +0900
Subject: [PATCH 4/7] Remove two stray blank lines in ltface.cc

---
 light/ltface.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/light/ltface.cc b/light/ltface.cc
index baddf249..27fd406d 100644
--- a/light/ltface.cc
+++ b/light/ltface.cc
@@ -2751,8 +2751,6 @@ static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, light
 
 
 
-
-
 #if defined(HAVE_GPU_LIGHT)
 namespace {
 struct gpu_direct_face_record_t {

From 4e1387f6c06437c356cd55305c1b141e548e9ade Mon Sep 17 00:00:00 2001
From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com>
Date: Sat, 20 Jun 2026 04:34:45 +0900
Subject: [PATCH 5/7] Remove unused GPU light processing code (the disabled
 LightFace_DirectGPU path) from ltface.cc and skip normalize() on the type-1
 source ray direction in direct_phase.comp for improved performance.

---
 light/gpu_shaders/direct_phase.comp |   2 +-
 light/ltface.cc                     | 267 ++++++----------------------
 2 files changed, 60 insertions(+), 209 deletions(-)

diff --git a/light/gpu_shaders/direct_phase.comp b/light/gpu_shaders/direct_phase.comp
index 43dd6b51..79b7e167 100644
--- a/light/gpu_shaders/direct_phase.comp
+++ b/light/gpu_shaders/direct_phase.comp
@@ -149,7 +149,7 @@ void main() {
         vec3 ncontrib_dir;
 
         if (l.type == 1u) {
-            ray_dir = normalize(l.direction);
+            ray_dir = l.direction;
             ray_dist = l.dist;
             float angle = dot(ray_dir, s.normal);
             if (s.twosided > 0.5 && angle < 0.0) angle = -angle;
diff --git a/light/ltface.cc b/light/ltface.cc
index 27fd406d..fa3401e7 100644
--- a/light/ltface.cc
+++ b/light/ltface.cc
@@ -2563,194 +2563,6 @@ lightsurf_t CreateLightmapSurface(const mbsp_t *bsp, const mface_t *face, const
 
 
 
-#if defined(HAVE_GPU_LIGHT)
-static bool LightFace_DirectGPU(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps)
-{
-
-    return false;
-
-    if (!GPU_TraceAvailable()) {
-        return false;
-    }
-
-    constexpr std::size_t GPU_DIRECT_MIN_JOBS = 32768;
-
-    const settings::worldspawn_keys &cfg = *lightsurf->cfg;
-    const modelinfo_t *modelinfo = lightsurf->modelinfo;
-    const qplane3f &plane = lightsurf->plane;
-    const std::size_t sample_count = lightsurf->samples.size();
-    if (!sample_count) {
-        return true;
-    }
-
-    std::vector<std::vector<gpu_light::direct_job_t>> per_sample(sample_count);
-
-    auto add_job = [&](int sample_index, const qvec3f &origin, const qvec3f &direction, float dist, const qvec3f &color, const qvec3f &normalcontrib) {
-        gpu_light::direct_job_t job{};
-        job.ox = origin[0];
-        job.oy = origin[1];
-        job.oz = origin[2];
-        job.tmin = 0.01f;
-        job.dx = direction[0];
-        job.dy = direction[1];
-        job.dz = direction[2];
-        job.tmax = dist;
-        job.cr = color[0];
-        job.cg = color[1];
-        job.cb = color[2];
-        job.nr = normalcontrib[0];
-        job.ng = normalcontrib[1];
-        job.nb = normalcontrib[2];
-        job.sample_index = static_cast<std::uint32_t>(sample_index);
-        per_sample[static_cast<std::size_t>(sample_index)].push_back(job);
-    };
-
-    // Entity lights.  This fast path is deliberately style-0/default-channel only.
-    for (const auto &entity_ptr : GetLights()) {
-        const light_t *entity = entity_ptr.get();
-        if (entity->getFormula() == LF_LOCALMIN) continue;
-        if (entity->nostaticlight.value()) continue;
-        if (entity->light.value() <= 0) continue;
-
-        if (entity->style.value() != 0) return false;
-        if (entity->shadow_channel_mask.value() != CHANNEL_MASK_DEFAULT) return false;
-        if (entity->light_channel_mask.value() != CHANNEL_MASK_DEFAULT) return false;
-
-        if (light_options.visapprox.value() == visapprox_t::VIS &&
-            entity->light_channel_mask.value() == CHANNEL_MASK_DEFAULT &&
-            entity->shadow_channel_mask.value() == CHANNEL_MASK_DEFAULT &&
-            VisCullEntity(bsp, lightsurf->pvs, entity->leaf)) {
-            continue;
-        }
-
-        const float planedist = plane.distance_to(entity->origin.value());
-        if (planedist < 0 && !entity->bleed.value() && !lightsurf->curved && !lightsurf->twosided) {
-            continue;
-        }
-        if (CullLight(entity, lightsurf)) {
-            continue;
-        }
-        if (!(entity->light_channel_mask.value() & lightsurf->object_channel_mask)) {
-            continue;
-        }
-
-        for (int i = 0; i < static_cast<int>(lightsurf->samples.size()); i++) {
-            const auto &sample = lightsurf->samples[i];
-            if (sample.occluded) continue;
-
-            const qvec3f &surfpoint = sample.point;
-            const qvec3f &surfnorm = sample.normal;
-            qvec3f surfpointToLightDir;
-            float surfpointToLightDist;
-            qvec3f color;
-            qvec3f normalcontrib;
-            GetLightContrib(cfg, entity, surfnorm, true, surfpoint, lightsurf->twosided, color, surfpointToLightDir, normalcontrib, &surfpointToLightDist);
-            const float occlusion = Dirt_GetScaleFactor(cfg, sample.occlusion, entity, surfpointToLightDist, lightsurf);
-            color *= occlusion;
-            if (fabs(LightSample_Brightness(color)) <= light_options.gate.value()) {
-                continue;
-            }
-            add_job(i, surfpoint, surfpointToLightDir, surfpointToLightDist, color, normalcontrib);
-        }
-    }
-
-    // Sunlight.  The GPU AS contains opaque solids only, so a miss is treated as visible sky.
-    // Sun texture filtering and non-zero styles stay on the CPU path.
-    for (const sun_t &sun : GetSuns()) {
-        if (sun.sunlight <= 0) continue;
-        if (sun.style != 0) return false;
-        if (sun.suntexture_value) return false;
-
-        qvec3f incoming = qv::normalize(sun.sunvec);
-        const float dp = qv::dot(incoming, plane.normal);
-        if (dp < -LIGHT_ANGLE_EPSILON && !lightsurf->curved && !lightsurf->twosided) {
-            continue;
-        }
-        if (!(lightsurf->object_channel_mask & CHANNEL_MASK_DEFAULT)) {
-            continue;
-        }
-
-        for (int i = 0; i < static_cast<int>(lightsurf->samples.size()); i++) {
-            const auto &sample = lightsurf->samples[i];
-            if (sample.occluded) continue;
-
-            const qvec3f &surfpoint = sample.point;
-            const qvec3f &surfnorm = sample.normal;
-            float angle = qv::dot(incoming, surfnorm);
-            if (lightsurf->twosided && angle < 0) {
-                angle = -angle;
-            }
-            angle = std::max(0.0f, angle);
-            angle = (1.0f - sun.anglescale) + sun.anglescale * angle;
-            float value = angle * sun.sunlight;
-            if (sun.dirt) {
-                value *= Dirt_GetScaleFactor(cfg, sample.occlusion, NULL, 0.0f, lightsurf);
-            }
-            qvec3f color = sun.sunlight_color * (value / 255.0f);
-            if (fabs(LightSample_Brightness(color)) <= light_options.gate.value()) {
-                continue;
-            }
-            qvec3f normalcontrib = incoming * value;
-            add_job(i, surfpoint, incoming, MAX_SKY_DIST, color, normalcontrib);
-        }
-    }
-
-    std::size_t job_count = 0;
-    for (const auto &v : per_sample) {
-        job_count += v.size();
-    }
-    if (job_count == 0) {
-        return true;
-    }
-    if (job_count < GPU_DIRECT_MIN_JOBS) {
-        return false;
-    }
-
-    std::vector<gpu_light::direct_job_t> jobs;
-    std::vector<gpu_light::direct_sample_range_t> ranges(sample_count);
-    jobs.reserve(job_count);
-    for (std::size_t i = 0; i < sample_count; ++i) {
-        ranges[i].first = static_cast<std::uint32_t>(jobs.size());
-        ranges[i].count = static_cast<std::uint32_t>(per_sample[i].size());
-        jobs.insert(jobs.end(), per_sample[i].begin(), per_sample[i].end());
-    }
-
-    std::vector<gpu_light::direct_accum_t> accum(sample_count);
-    if (!gpu_light::trace_direct_accumulate_batch(
-            modelinfo,
-            CHANNEL_MASK_DEFAULT,
-            jobs.data(),
-            jobs.size(),
-            ranges.data(),
-            accum.data(),
-            sample_count)) {
-        return false;
-    }
-
-    lightmap_t *lightmap = Lightmap_ForStyle(lightmaps, 0, lightsurf);
-    bool hit = false;
-    for (std::size_t i = 0; i < sample_count; ++i) {
-        if (!accum[i].hit) continue;
-        const qvec3f color{accum[i].cr, accum[i].cg, accum[i].cb};
-        const qvec3f normalcontrib{accum[i].nr, accum[i].ng, accum[i].nb};
-        lightsample_t &sample = lightmap->samples[i];
-        sample.color += color;
-        sample.direction += normalcontrib;
-        lightmap->bounce_color += color;
-        hit = true;
-    }
-    if (hit) {
-        Lightmap_Save(bsp, lightmaps, lightsurf, lightmap, 0);
-    }
-    return true;
-}
-#endif
-
-
-
-
-
-
 #if defined(HAVE_GPU_LIGHT)
 namespace {
 struct gpu_direct_face_record_t {
@@ -2777,10 +2589,14 @@ struct gpu_direct_source_key_t {
     std::uint32_t flags = 0;
     int px = 0, py = 0, pz = 0;
     int dx = 0, dy = 0, dz = 0;
-    int cr = 0, cg = 0, cb = 0;
-    int light = 0, atten = 0, anglescale = 0, falloff = 0;
+    int atten = 0, anglescale = 0, falloff = 0;
 };
 
+// lower values merge more nearby sun rays into one representative ray.
+// This preserves approximate energy by accumulating light/color into the merged source.
+// Raise to 64/128 for quality, lower to 16/8 for speed.
+static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 32.0f;
+
 static int GPU_Direct_Quantize(float v, float scale = 4096.0f)
 {
     return static_cast<int>(std::lround(v * scale));
@@ -2795,13 +2611,10 @@ static gpu_direct_source_key_t GPU_Direct_SourceKey(const gpu_light::direct_phas
     k.px = GPU_Direct_Quantize(s.px);
     k.py = GPU_Direct_Quantize(s.py);
     k.pz = GPU_Direct_Quantize(s.pz);
-    k.dx = GPU_Direct_Quantize(s.dx);
-    k.dy = GPU_Direct_Quantize(s.dy);
-    k.dz = GPU_Direct_Quantize(s.dz);
-    k.cr = GPU_Direct_Quantize(s.cr);
-    k.cg = GPU_Direct_Quantize(s.cg);
-    k.cb = GPU_Direct_Quantize(s.cb);
-    k.light = GPU_Direct_Quantize(s.light, 1024.0f);
+    const float dir_scale = (s.type == 1u) ? GPU_DIRECT_SUN_DIR_MERGE_SCALE : 4096.0f;
+    k.dx = GPU_Direct_Quantize(s.dx, dir_scale);
+    k.dy = GPU_Direct_Quantize(s.dy, dir_scale);
+    k.dz = GPU_Direct_Quantize(s.dz, dir_scale);
     k.atten = GPU_Direct_Quantize(s.atten, 1024.0f);
     k.anglescale = GPU_Direct_Quantize(s.anglescale, 1024.0f);
     k.falloff = GPU_Direct_Quantize(s.falloff, 1024.0f);
@@ -2813,18 +2626,44 @@ static bool GPU_Direct_SourceKeyEquals(const gpu_direct_source_key_t &a, const g
     return a.type == b.type && a.formula == b.formula && a.flags == b.flags &&
         a.px == b.px && a.py == b.py && a.pz == b.pz &&
         a.dx == b.dx && a.dy == b.dy && a.dz == b.dz &&
-        a.cr == b.cr && a.cg == b.cg && a.cb == b.cb &&
-        a.light == b.light && a.atten == b.atten &&
-        a.anglescale == b.anglescale && a.falloff == b.falloff;
+        a.atten == b.atten && a.anglescale == b.anglescale && a.falloff == b.falloff;
 }
 
-static void GPU_Direct_AddUniqueSource(
+static void GPU_Direct_MergeInto(gpu_light::direct_phase_source_t &dst, const gpu_light::direct_phase_source_t &src)
+{
+    const float a = std::max(dst.light, 0.0f);
+    const float b = std::max(src.light, 0.0f);
+    const float total = a + b;
+    if (total <= 0.0f) {
+        return;
+    }
+
+    dst.cr = (dst.cr * a + src.cr * b) / total;
+    dst.cg = (dst.cg * a + src.cg * b) / total;
+    dst.cb = (dst.cb * a + src.cb * b) / total;
+
+    if (dst.type == 1u) {
+        qvec3f d{dst.dx * a + src.dx * b, dst.dy * a + src.dy * b, dst.dz * a + src.dz * b};
+        const float len2 = qv::dot(d, d);
+        if (len2 > 0.0001f) {
+            d = d * (1.0f / std::sqrt(len2));
+            dst.dx = d[0];
+            dst.dy = d[1];
+            dst.dz = d[2];
+        }
+    }
+
+    dst.light = total;
+}
+
+static void GPU_Direct_AddMergedSource(
     std::vector<gpu_direct_source_key_t> &keys,
     const gpu_light::direct_phase_source_t &src)
 {
     const auto key = GPU_Direct_SourceKey(src);
-    for (const auto &existing : keys) {
-        if (GPU_Direct_SourceKeyEquals(existing, key)) {
+    for (std::size_t i = 0; i < keys.size(); ++i) {
+        if (GPU_Direct_SourceKeyEquals(keys[i], key)) {
+            GPU_Direct_MergeInto(g_gpu_direct_sources[i], src);
             return;
         }
     }
@@ -2883,7 +2722,7 @@ static bool GPU_Direct_SourceAffectsFace(
 
     if (src.type == 1) {
         const qvec3f dir{src.dx, src.dy, src.dz};
-        return qv::dot(normal, dir) > -0.05f;
+        return qv::dot(normal, dir) > -0.01f;
     }
 
     const float radius = GPU_Direct_EffectivePointRadius(src);
@@ -2921,6 +2760,8 @@ static bool GPU_DirectQueue_BuildSourcesLocked()
     std::vector<gpu_direct_source_key_t> unique_keys;
 
     std::size_t raw_sources = 0;
+    std::size_t raw_point_sources = 0;
+    std::size_t raw_sun_sources = 0;
     for (const auto &entity_ptr : GetLights()) {
         const light_t *entity = entity_ptr.get();
         if (entity->nostaticlight.value()) continue;
@@ -2952,7 +2793,8 @@ static bool GPU_DirectQueue_BuildSourcesLocked()
         src.dirt = entity->dirt.value();
         src.falloff = entity->falloff.value();
         ++raw_sources;
-        GPU_Direct_AddUniqueSource(unique_keys, src);
+        ++raw_point_sources;
+        GPU_Direct_AddMergedSource(unique_keys, src);
     }
 
     for (const sun_t &sun : GetSuns()) {
@@ -2975,11 +2817,20 @@ static bool GPU_DirectQueue_BuildSourcesLocked()
         src.anglescale = sun.anglescale;
         src.dirt = sun.dirt ? 1.0f : 0.0f;
         ++raw_sources;
-        GPU_Direct_AddUniqueSource(unique_keys, src);
+        ++raw_sun_sources;
+        GPU_Direct_AddMergedSource(unique_keys, src);
+    }
+
+    std::size_t merged_point_sources = 0;
+    std::size_t merged_sun_sources = 0;
+    for (const auto &src : g_gpu_direct_sources) {
+        if (src.type == 1u) ++merged_sun_sources;
+        else ++merged_point_sources;
     }
 
-    logging::print("GPU direct phase: queued {} compatible direct sources ({} raw, {} deduped).\n",
-        g_gpu_direct_sources.size(), raw_sources, raw_sources - g_gpu_direct_sources.size());
+    logging::print("GPU direct phase: queued {} merged direct sources ({} raw: {} point, {} sun; merged: {} point, {} sun; {} merged away; sun merge scale {}).\n",
+        g_gpu_direct_sources.size(), raw_sources, raw_point_sources, raw_sun_sources,
+        merged_point_sources, merged_sun_sources, raw_sources - g_gpu_direct_sources.size(), GPU_DIRECT_SUN_DIR_MERGE_SCALE);
     return true;
 }
 

From 5a3885e9ac5bccd57b52b980a0305ee1f33487fe Mon Sep 17 00:00:00 2001
From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com>
Date: Sat, 20 Jun 2026 04:37:55 +0900
Subject: [PATCH 6/7] Update GPU_DIRECT_SUN_DIR_MERGE_SCALE for improved
 quality in light processing

---
 light/ltface.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/light/ltface.cc b/light/ltface.cc
index fa3401e7..31f87183 100644
--- a/light/ltface.cc
+++ b/light/ltface.cc
@@ -2594,8 +2594,8 @@ struct gpu_direct_source_key_t {
 
 // lower values merge more nearby sun rays into one representative ray.
 // This preserves approximate energy by accumulating light/color into the merged source.
-// Raise to 64/128 for quality, lower to 16/8 for speed.
-static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 32.0f;
+// Raise to 512~4096 for quality, lower to 16/8 for speed.
+static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 512.0f;
 
 static int GPU_Direct_Quantize(float v, float scale = 4096.0f)
 {

From 5091dc811bb30cfe964478e2c0d3360f7f06517c Mon Sep 17 00:00:00 2001
From: y4my4my4m <8145020+y4my4my4m@users.noreply.github.com>
Date: Sat, 20 Jun 2026 11:03:09 +0900
Subject: [PATCH 7/7] Add GPU sun merging and source culling options

- Introduced `gpusunmerge` and `gpusunmergequality` settings for approximate merging of nearby GPU sun jitter rays.
- Added `gpusourcecull` and `gpusourcecullquality` settings to enable and control approximate GPU per-face source culling.
- Updated light processing logic to utilize new settings for improved performance and quality in light rendering.
---
 include/light/light.hh |   4 ++
 light/light.cc         |   7 ++-
 light/ltface.cc        | 100 ++++++++++++++++++++++++++++++++---------
 3 files changed, 90 insertions(+), 21 deletions(-)

diff --git a/include/light/light.hh b/include/light/light.hh
index 4b2852a6..177cf5f6 100644
--- a/include/light/light.hh
+++ b/include/light/light.hh
@@ -396,6 +396,10 @@ public:
     setting_bool novanilla;
     setting_scalar gate;
     setting_int32 sunsamples;
+    setting_bool gpusunmerge; // -gpusunmerge: approximate-merge nearby GPU sun jitter rays
+    setting_scalar gpusunmergequality; // -gpusunmergequality: 0 fast/rough, 1 slow/high quality
+    setting_bool gpusourcecull; // -gpusourcecull: use approximate GPU per-face source culling
+    setting_scalar gpusourcecullquality; // -gpusourcecullquality: 0 fast/aggressive, 1 safest/conservative
     settings::setting_bool gpu; // -gpu: use Vulkan GPU ray-query backend when available
     setting_bool arghradcompat;
     setting_bool nolighting;
diff --git a/light/light.cc b/light/light.cc
index 55ff4709..b20e4d3d 100644
--- a/light/light.cc
+++ b/light/light.cc
@@ -292,7 +292,12 @@ light_settings::light_settings()
       write_normals{this, "wrnormals", false, &output_group, "output normals, tangents and bitangents in a BSPX lump"},
       novanilla{this, "novanilla", false, &experimental_group, "implies -bspxlit; don't write vanilla lighting"},
       gate{this, "gate", LIGHT_EQUAL_EPSILON, &performance_group, "cutoff lights at this brightness level"},
-      sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"}, gpu{this, "gpu", false, &performance_group, "use Vulkan GPU ray-query backend for batched visibility rays"},
+      sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"},
+      gpusunmerge{this, "gpusunmerge", false, &performance_group, "approximate-merge nearby GPU sun jitter rays"},
+      gpusunmergequality{this, "gpusunmergequality", 0.75f, 0.0f, 1.0f, &performance_group, "GPU sun merge quality: 0 fast/rough, 1 slow/high quality"},
+      gpusourcecull{this, "gpusourcecull", false, &performance_group, "use approximate GPU per-face source culling"},
+      gpusourcecullquality{this, "gpusourcecullquality", 1.0f, 0.0f, 1.0f, &performance_group, "GPU source culling quality: 0 fast/aggressive, 1 safest/conservative"},
+      gpu{this, "gpu", false, &performance_group, "use Vulkan GPU ray-query backend for batched visibility rays"},
       arghradcompat{this, "arghradcompat", false, &output_group, "enable compatibility for Arghrad-specific keys"},
       nolighting{this, "nolighting", false, &output_group, "don't output main world lighting (Q2RTX)"},
       debugface{this, "debugface", std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
diff --git a/light/ltface.cc b/light/ltface.cc
index 31f87183..649312ed 100644
--- a/light/ltface.cc
+++ b/light/ltface.cc
@@ -2592,10 +2592,45 @@ struct gpu_direct_source_key_t {
     int atten = 0, anglescale = 0, falloff = 0;
 };
 
-// lower values merge more nearby sun rays into one representative ray.
-// This preserves approximate energy by accumulating light/color into the merged source.
-// Raise to 512~4096 for quality, lower to 16/8 for speed.
-static constexpr float GPU_DIRECT_SUN_DIR_MERGE_SCALE = 512.0f;
+// Optional approximate sun-direction merge. Disabled by default for final quality.
+// When enabled with -gpusunmerge, -gpusunmergequality maps to a direction quantization scale:
+//   0.00 => 16   fastest/roughest
+//   0.50 => 256  balanced preview
+//   1.00 => 4096 best quality/least merging
+static float GPU_Direct_SunMergeQuality()
+{
+    float q = light_options.gpusunmergequality.value();
+    if (!std::isfinite(q)) {
+        q = 0.75f;
+    }
+    if (q < 0.0f) q = 0.0f;
+    if (q > 1.0f) q = 1.0f;
+    return q;
+}
+
+static float GPU_Direct_SunMergeScale()
+{
+    if (!light_options.gpusunmerge.value()) {
+        return 65536.0f; // effectively exact; preserves final-quality sun jitter
+    }
+    return 16.0f * std::pow(256.0f, GPU_Direct_SunMergeQuality());
+}
+
+static float GPU_Direct_SourceCullQuality()
+{
+    float q = light_options.gpusourcecullquality.value();
+    if (!std::isfinite(q)) {
+        q = 1.0f;
+    }
+    if (q < 0.0f) q = 0.0f;
+    if (q > 1.0f) q = 1.0f;
+    return q;
+}
+
+static bool GPU_Direct_SourceCullEnabled()
+{
+    return light_options.gpusourcecull.value();
+}
 
 static int GPU_Direct_Quantize(float v, float scale = 4096.0f)
 {
@@ -2611,7 +2646,7 @@ static gpu_direct_source_key_t GPU_Direct_SourceKey(const gpu_light::direct_phas
     k.px = GPU_Direct_Quantize(s.px);
     k.py = GPU_Direct_Quantize(s.py);
     k.pz = GPU_Direct_Quantize(s.pz);
-    const float dir_scale = (s.type == 1u) ? GPU_DIRECT_SUN_DIR_MERGE_SCALE : 4096.0f;
+    const float dir_scale = (s.type == 1u) ? GPU_Direct_SunMergeScale() : 4096.0f;
     k.dx = GPU_Direct_Quantize(s.dx, dir_scale);
     k.dy = GPU_Direct_Quantize(s.dy, dir_scale);
     k.dz = GPU_Direct_Quantize(s.dz, dir_scale);
@@ -2716,34 +2751,52 @@ static bool GPU_Direct_SourceAffectsFace(
     const qvec3f &normal,
     bool twosided)
 {
+    if (!GPU_Direct_SourceCullEnabled()) {
+        return true;
+    }
     if (twosided) {
         return true;
     }
 
+    const float quality = GPU_Direct_SourceCullQuality();
+
     if (src.type == 1) {
+        // Sun normal culling is the quality-sensitive part. At max quality we keep
+        // all sun jitter directions for every face. Lower quality progressively
+        // removes back-facing sun directions.
+        if (quality >= 0.999f) {
+            return true;
+        }
         const qvec3f dir{src.dx, src.dy, src.dz};
-        return qv::dot(normal, dir) > -0.01f;
+        const float threshold = -0.50f + (0.55f * (1.0f - quality)); // q=0 -> 0.05, q=1 -> -0.50
+        return qv::dot(normal, dir) > threshold;
     }
 
     const float radius = GPU_Direct_EffectivePointRadius(src);
     if (radius < static_cast<float>(MAX_SKY_DIST) * 0.999f) {
+        // More quality = more radius padding = less chance of missing a faint edge case.
+        const float padded_radius = radius * (1.0f + 3.0f * quality) + 256.0f * quality;
         const float d2 = GPU_Direct_PointAABBDistance2(src, mins, maxs);
-        if (d2 > radius * radius) {
+        if (d2 > padded_radius * padded_radius) {
             return false;
         }
     }
 
-    // Conservative face-normal cull for point lights: use vector from face center to light.
-    const qvec3f center{
-        (mins[0] + maxs[0]) * 0.5f,
-        (mins[1] + maxs[1]) * 0.5f,
-        (mins[2] + maxs[2]) * 0.5f};
-    qvec3f to_light{src.px - center[0], src.py - center[1], src.pz - center[2]};
-    const float to_light_len2 = qv::dot(to_light, to_light);
-    if (to_light_len2 > 0.0001f) {
-        to_light = to_light * (1.0f / std::sqrt(to_light_len2));
-        if (qv::dot(normal, to_light) <= -0.10f) {
-            return false;
+    // Conservative face-normal cull for point lights. At max quality this is disabled;
+    // lower quality allows removing back-facing points.
+    if (quality < 0.999f) {
+        const qvec3f center{
+            (mins[0] + maxs[0]) * 0.5f,
+            (mins[1] + maxs[1]) * 0.5f,
+            (mins[2] + maxs[2]) * 0.5f};
+        qvec3f to_light{src.px - center[0], src.py - center[1], src.pz - center[2]};
+        const float to_light_len2 = qv::dot(to_light, to_light);
+        if (to_light_len2 > 0.0001f) {
+            to_light = to_light * (1.0f / std::sqrt(to_light_len2));
+            const float threshold = -0.75f + (0.65f * (1.0f - quality)); // q=0 -> -0.10, q=1 -> -0.75
+            if (qv::dot(normal, to_light) <= threshold) {
+                return false;
+            }
         }
     }
 
@@ -2828,9 +2881,16 @@ static bool GPU_DirectQueue_BuildSourcesLocked()
         else ++merged_point_sources;
     }
 
-    logging::print("GPU direct phase: queued {} merged direct sources ({} raw: {} point, {} sun; merged: {} point, {} sun; {} merged away; sun merge scale {}).\n",
+    const bool sun_merge_enabled = light_options.gpusunmerge.value();
+    const float sun_merge_quality = GPU_Direct_SunMergeQuality();
+    const float sun_merge_scale = GPU_Direct_SunMergeScale();
+    const bool source_cull_enabled = GPU_Direct_SourceCullEnabled();
+    const float source_cull_quality = GPU_Direct_SourceCullQuality();
+    logging::print("GPU direct phase: queued {} direct sources ({} raw: {} point, {} sun; merged: {} point, {} sun; {} merged away; sun merge {}; quality {:.2f}; scale {:.1f}; source cull {}; quality {:.2f}).\n",
         g_gpu_direct_sources.size(), raw_sources, raw_point_sources, raw_sun_sources,
-        merged_point_sources, merged_sun_sources, raw_sources - g_gpu_direct_sources.size(), GPU_DIRECT_SUN_DIR_MERGE_SCALE);
+        merged_point_sources, merged_sun_sources, raw_sources - g_gpu_direct_sources.size(),
+        sun_merge_enabled ? "on" : "off", sun_merge_quality, sun_merge_scale,
+        source_cull_enabled ? "on" : "off", source_cull_quality);
     return true;
 }