diff --git a/include/light/light.hh b/include/light/light.hh
index 0e3cb7f5..177cf5f6 100644
--- a/include/light/light.hh
+++ b/include/light/light.hh
@@ -396,6 +396,11 @@ public:
     setting_bool novanilla;
     setting_scalar gate;
     setting_int32 sunsamples;
+    setting_bool gpusunmerge; // -gpusunmerge: approximate-merge nearby GPU sun jitter rays
+    setting_scalar gpusunmergequality; // -gpusunmergequality: 0 fast/rough, 1 slow/high quality
+    setting_bool gpusourcecull; // -gpusourcecull: use approximate GPU per-face source culling
+    setting_scalar gpusourcecullquality; // -gpusourcecullquality: 0 fast/aggressive, 1 safest/conservative
+    setting_bool gpu; // -gpu: use Vulkan GPU ray-query backend when available
     setting_bool arghradcompat;
     setting_bool nolighting;
     setting_vec3 debugface;
diff --git a/include/light/trace_embree.hh b/include/light/trace_embree.hh
index 3f4856c5..997bc404 100644
--- a/include/light/trace_embree.hh
+++ b/include/light/trace_embree.hh
@@ -18,6 +18,7 @@
  */
 #pragma once
 
+#include
 #include
 #include
 
@@ -280,6 +281,56 @@ public:
         if (!_rays.size())
             return;
 
+#if defined(HAVE_GPU_LIGHT)
+        // Optional large-batch occlusion path. v5 direct lighting uses
+        // direct_phase.comp; small fallback raystreams stay on Embree.
+        constexpr size_t GPU_OCCLUSION_MIN_BATCH = 262144;
+
+        if (_rays.size() >= GPU_OCCLUSION_MIN_BATCH && GPU_TraceAvailable()) {
+            std::vector gpu_rays;
+            std::vector gpu_results;
+
+            gpu_rays.resize(_rays.size());
+            gpu_results.resize(_rays.size());
+
+            // Repack the Embree rays into the flat layout the GPU backend consumes.
+            for (size_t i = 0; i < _rays.size(); ++i) {
+                const auto &src = _rays[i].ray.ray;
+                auto &dst = gpu_rays[i];
+
+                dst.origin[0] = src.org_x;
+                dst.origin[1] = src.org_y;
+                dst.origin[2] = src.org_z;
+                dst.tmin = src.tnear;
+
+                dst.direction[0] = src.dir_x;
+                dst.direction[1] = src.dir_y;
+                dst.direction[2] = src.dir_z;
+                dst.tmax = src.tfar;
+
+                dst.shadow_mask = static_cast(shadowmask);
+                dst.user_index = static_cast(i);
+            }
+
+            if (gpu_light::trace_occlusion_batch(
+                    self,
+                    static_cast(shadowmask),
+                    gpu_rays.data(),
+                    gpu_results.data(),
+                    gpu_rays.size())) {
+                // Flag occluded rays by making tfar negative; only the sign
+                // changes, the magnitude is kept.
+                for (size_t i = 0; i < _rays.size(); ++i) {
+                    if (gpu_results[i].occluded) {
+                        _rays[i].ray.ray.tfar = -std::abs(_rays[i].ray.ray.tfar);
+                    }
+                }
+                return;
+            }
+            // The GPU backend declined or failed: fall through to the Embree loop below.
+        }
+#endif
+
         ray_source_info ctx2(this, self, shadowmask);
         RTCOccludedArguments embree4_args = ctx2.setup_occluded_arguments();
         for (auto &ray : _rays)
diff --git a/include/light/trace_gpu.hh b/include/light/trace_gpu.hh
new file mode 100644
index 00000000..96ac2f85
--- /dev/null
+++ b/include/light/trace_gpu.hh
@@ -0,0 +1,152 @@
+/* GPU trace backend
+ * Prototype overlay generated for Linux/Vulkan ray-query development.
+ */
+#pragma once
+
+#include
+#include
+#include
+
+struct mbsp_t;
+class modelinfo_t;
+
+#ifndef HAVE_GPU_LIGHT
+#define GPU_LIGHT_COMPILED 0
+#else
+#define GPU_LIGHT_COMPILED 1
+#endif
+
+namespace gpu_light {
+
+// One occlusion ray. Field order mirrors GpuRay in occlusion.comp.
+struct ray_t {
+    float origin[3] = {0, 0, 0};
+    float tmin = 0.01f;
+    float direction[3] = {0, 0, 1};
+    float tmax = 0.0f;
+    std::uint32_t shadow_mask = 0xffffffffu;
+    std::uint32_t user_index = 0;
+};
+
+// Per-ray answer. Field order mirrors GpuOcclusionResult in occlusion.comp.
+struct occlusion_result_t {
+    std::uint32_t occluded = 0;
+    std::uint32_t reserved0 = 0;
+    float transmittance[3] = {1.0f, 1.0f, 1.0f};
+};
+
+// One ray of the job-driven accumulate path (trace_direct_accumulate_batch):
+// origin/direction plus the color (c*) and normal (n*) contribution it carries.
+struct direct_job_t {
+    float ox = 0, oy = 0, oz = 0, tmin = 0.01f;
+    float dx = 0, dy = 0, dz = 1, tmax = 0.0f;
+    float cr = 0, cg = 0, cb = 0, pad0 = 0;
+    float nr = 0, ng = 0, nb = 0, pad1 = 0;
+    std::uint32_t sample_index = 0;
+    std::uint32_t flags = 0;
+    std::uint32_t reserved0 = 0;
+    std::uint32_t reserved1 = 0;
+};
+
+// A [first, first + count) span; passed as `ranges` to trace_direct_accumulate_batch.
+struct direct_sample_range_t {
+    std::uint32_t first = 0;
+    std::uint32_t count = 0;
+};
+
+// Accumulated color/normal for one sample. Mirrors GpuDirectAccum in direct_phase.comp.
+struct direct_accum_t {
+    float cr = 0, cg = 0, cb = 0, pad0 = 0;
+    float nr = 0, ng = 0, nb = 0, pad1 = 0;
+    std::uint32_t hit = 0;
+    std::uint32_t reserved0 = 0;
+    std::uint32_t reserved1 = 0;
+    std::uint32_t reserved2 = 0;
+};
+
+// One lightmap sample. Mirrors GpuDirectPhaseSample in direct_phase.comp;
+// the shader skips samples whose `twosided` is below -0.5.
+struct direct_phase_sample_t {
+    float px = 0, py = 0, pz = 0, occlusion = 1;
+    float nx = 0, ny = 0, nz = 1, twosided = 0;
+    std::uint32_t face_index = 0;
+    std::uint32_t reserved0 = 0;
+    std::uint32_t reserved1 = 0;
+    std::uint32_t reserved2 = 0;
+};
+
+// Span into the face-source index list. Mirrors GpuDirectPhaseFaceRange.
+struct direct_phase_face_range_t {
+    std::uint32_t source_begin = 0;
+    std::uint32_t source_count = 0;
+};
+
+// One light source. Mirrors GpuDirectPhaseSource in direct_phase.comp.
+struct direct_phase_source_t {
+    float px = 0, py = 0, pz = 0, light = 0;
+    float dx = 0, dy = 0, dz = 1, dist = 65536.0f;
+    float cr = 1, cg = 1, cb = 1, atten = 1;
+    std::uint32_t type = 0; // 0 = point, 1 = sun
+    std::uint32_t formula = 0; // light_formula_t for point lights
+    std::uint32_t flags = 0; // bit 0: dirt
+    std::uint32_t reserved0 = 0;
+    float anglescale = 1;
+    float dirt = 0;
+    float falloff = 0;
+    float pad0 = 0;
+};
+
+using direct_phase_accum_t = direct_accum_t;
+
+// These structs are copied byte-for-byte into the shader storage buffers, so
+// their sizes must equal the strides of the matching GLSL structs. Fail the
+// build, rather than silently corrupt GPU data, if a field is added on one
+// side only.
+static_assert(sizeof(ray_t) == 40, "ray_t must match GpuRay in occlusion.comp");
+static_assert(sizeof(occlusion_result_t) == 20, "occlusion_result_t must match GpuOcclusionResult in occlusion.comp");
+static_assert(sizeof(direct_accum_t) == 48, "direct_accum_t must match GpuDirectAccum in direct_phase.comp");
+static_assert(sizeof(direct_phase_sample_t) == 48, "direct_phase_sample_t must match GpuDirectPhaseSample in direct_phase.comp");
+static_assert(sizeof(direct_phase_face_range_t) == 8, "direct_phase_face_range_t must match GpuDirectPhaseFaceRange in direct_phase.comp");
+static_assert(sizeof(direct_phase_source_t) == 80, "direct_phase_source_t must match GpuDirectPhaseSource in direct_phase.comp");
+
+
+enum class backend_state_t {
+    unavailable,
+    initialized,
+    failed
+};
+
+// Counters reported by stats().
+struct stats_t {
+    std::uint64_t batches = 0;
+    std::uint64_t rays = 0;
+    std::uint64_t gpu_batches = 0;
+    std::uint64_t fallback_batches = 0;
+};
+
+bool requested();
+backend_state_t state();
+const char *state_string();
+const char *last_error();
+stats_t stats();
+
+bool init(const mbsp_t *bsp);
+void shutdown();
+
+// Returns true when the batch was handled by the GPU backend. Returns false to
+// tell the caller to run the existing CPU/Embree path.
+bool trace_occlusion_batch(
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const ray_t *rays,
+    occlusion_result_t *results,
+    std::size_t count);
+
+
+// Evaluates the listed sources for every sample and writes one accumulator per
+// sample into `accum`. `face_ranges` is indexed by each sample's face_index and
+// selects a span of `face_source_indices`, which in turn index `sources`.
+bool trace_direct_phase_batch(
+    const direct_phase_source_t *sources,
+    std::size_t source_count,
+    const direct_phase_sample_t *samples,
+    direct_phase_accum_t *accum,
+    std::size_t sample_count,
+    const direct_phase_face_range_t *face_ranges,
+    std::size_t face_range_count,
+    const std::uint32_t *face_source_indices,
+    std::size_t face_source_index_count);
+
+bool trace_direct_accumulate_batch(
+    const modelinfo_t *self,
+    std::uint32_t shadow_mask,
+    const direct_job_t *jobs,
+    std::size_t job_count,
+    const direct_sample_range_t *ranges,
+    direct_accum_t *accum,
+    std::size_t sample_count);
+
+} // namespace gpu_light
+
+bool GPU_TraceInit(const mbsp_t *bsp);
+void GPU_TraceShutdown();
+bool GPU_TraceAvailable();
+const char *GPU_TraceLastError();
+
+// Flushes pending sample-driven direct-light work.
+void GPU_DirectQueue_Flush(const mbsp_t *bsp);
diff --git a/light/CMakeLists.txt b/light/CMakeLists.txt
index 030bde65..73ffcda6 100644
--- a/light/CMakeLists.txt
+++ b/light/CMakeLists.txt
@@ -1,3 +1,4 @@
+option(LIGHT_ENABLE_VULKAN_GPU "Enable Vulkan GPU ray-query backend for light" OFF)
 option(SKIP_TBB_INSTALL "Skip TBB Library Installation" OFF)
 option(SKIP_EMBREE_INSTALL "Skip Embree Library Installation" OFF)
 
@@ -9,7 +10,7 @@ set(LIGHT_INCLUDES
     ../include/light/bounce.hh
     ../include/light/surflight.hh
     ../include/light/ltface.hh
-    ../include/light/trace.hh
+    ../include/light/trace.hh ../include/light/trace_gpu.hh
     ../include/light/write.hh
     ../include/light/spatialindex.hh
     )
@@ -47,9 +48,43 @@ endif(embree_FOUND)
 
 add_library(liblight STATIC ${LIGHT_SOURCES})
 
+# Optional Vulkan backend: adds the GPU sources, defines HAVE_GPU_LIGHT and compiles the compute shaders to SPIR-V.
+if (LIGHT_ENABLE_VULKAN_GPU)
+    find_package(Vulkan REQUIRED)
+    find_program(GLSLANG_VALIDATOR glslangValidator REQUIRED)
+
+    target_sources(liblight PRIVATE
+        trace_gpu.cc
+        trace_gpu_vulkan.cc
+    )
+    target_compile_definitions(liblight PRIVATE HAVE_GPU_LIGHT=1)
+    target_link_libraries(liblight PRIVATE Vulkan::Vulkan)
+
+    set(GPU_SHADER_SPVS)
+    # One custom command per shader; each .spv is regenerated when its .comp source changes.
+    foreach(GPU_SHADER_NAME occlusion direct_phase)
+        set(GPU_SHADER_SRC "${CMAKE_CURRENT_SOURCE_DIR}/gpu_shaders/${GPU_SHADER_NAME}.comp")
+        set(GPU_SHADER_SPV "${CMAKE_CURRENT_BINARY_DIR}/gpu_shaders/${GPU_SHADER_NAME}.comp.spv")
+        add_custom_command(
+            OUTPUT "${GPU_SHADER_SPV}"
+            COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/gpu_shaders"
+            COMMAND "${GLSLANG_VALIDATOR}" -V "${GPU_SHADER_SRC}" -o "${GPU_SHADER_SPV}"
+            DEPENDS "${GPU_SHADER_SRC}"
+            VERBATIM)
+        list(APPEND GPU_SHADER_SPVS "${GPU_SHADER_SPV}")
+    endforeach()
+    add_custom_target(light_gpu_shaders DEPENDS ${GPU_SHADER_SPVS})
+    add_dependencies(liblight light_gpu_shaders)
+endif()
+
 target_link_libraries(liblight PRIVATE common ${CMAKE_THREAD_LIBS_INIT} fmt::fmt jsoncpp_static)
 
 add_executable(light main.cc)
+# Make building the light executable also build the compiled shaders.
+if (LIGHT_ENABLE_VULKAN_GPU)
+    add_dependencies(light light_gpu_shaders)
+endif()
+
 target_link_libraries(light PRIVATE common liblight)
 
 if (embree_FOUND)
diff --git a/light/gpu_shaders/direct_phase.comp b/light/gpu_shaders/direct_phase.comp
new file mode 100644
index 00000000..79b7e167
--- /dev/null
+++ b/light/gpu_shaders/direct_phase.comp
@@ -0,0 +1,202 @@
+#version 460
+#extension GL_EXT_ray_query : require
+#extension GL_EXT_scalar_block_layout : require
+
+layout(local_size_x = 64) in; // main() handles one sample per invocation
+
+struct GpuDirectPhaseSample { // host side: gpu_light::direct_phase_sample_t
+    vec3 position;
+    float occlusion;
+    vec3 normal;
+    float twosided; // < -0.5: sample is skipped; > 0.5: back-facing angles are mirrored
+    uint faceIndex; // index into faceRangeBuffer.ranges
+    uint reserved0;
+    uint reserved1;
+    uint reserved2;
+};
+
+struct GpuDirectPhaseSource { // host side: gpu_light::direct_phase_source_t
+    vec3 position;
+    float light;
+    vec3 direction;
+    float dist;
+    vec3 color;
+    float atten;
+    uint type; // 0 = point, 1 = sun
+    uint formula; // light_formula_t for point lights
+    uint flags; // bit 0 = dirt
+    uint reserved0;
+    float anglescale;
+    float dirt;
+    float falloff;
+    float pad0;
+};
+
+struct GpuDirectPhaseFaceRange { // span of faceSourceIndexBuffer.indices used by one face
+    uint sourceBegin;
+    uint sourceCount;
+};
+
+struct GpuDirectAccum { // host side: gpu_light::direct_accum_t
+    vec3 color;
+    float pad0;
+    vec3 normal;
+    float pad1;
+    uint hit; // 1 when at least one source reached the sample
+    uint reserved0;
+    uint reserved1;
+    uint reserved2;
+};
+
+layout(set = 0, binding = 0) uniform accelerationStructureEXT sceneAS;
+layout(std430, set = 0, binding = 1) readonly buffer Samples { GpuDirectPhaseSample samples[]; } sampleBuffer;
+layout(std430, set = 0, binding = 2) readonly buffer Sources { GpuDirectPhaseSource sources[]; } sourceBuffer;
+layout(std430, set = 0, binding = 3) readonly buffer FaceRanges { GpuDirectPhaseFaceRange ranges[]; } faceRangeBuffer;
+layout(std430, set = 0, binding = 4) readonly buffer FaceSourceIndices { uint indices[]; } faceSourceIndexBuffer;
+layout(std430, set = 0, binding = 5) writeonly buffer Accum { GpuDirectAccum accum[]; } accumBuffer;
+
+layout(push_constant) uniform PushConstants {
+    uint sampleCount; // invocations at or beyond this index exit immediately
+    uint sourceCount; // source indices at or beyond this are skipped
+    uint flags;
+    uint reserved0;
+} pc;
+
+bool occluded(vec3 origin, vec3 dir, float tmax) { // true when anything is hit between tmin and tmax
+    rayQueryEXT
rq;
+    rayQueryInitializeEXT(
+        rq,
+        sceneAS,
+        gl_RayFlagsTerminateOnFirstHitEXT | gl_RayFlagsOpaqueEXT, // stop at the first hit; treat all geometry as opaque
+        0xff,
+        origin,
+        0.01, // fixed tmin; presumably avoids re-hitting the surface the sample sits on
+        dir,
+        max(tmax, 0.02)); // keeps tmax above the fixed tmin
+
+    while (rayQueryProceedEXT(rq)) {}
+
+    return rayQueryGetIntersectionTypeEXT(rq, true) != gl_RayQueryCommittedIntersectionNoneEXT;
+}
+
+float point_light_value(uint formula, float light, float atten, float dist, float falloff) { // brightness of a point light at distance dist, before the angle term
+    float d = max(dist, 1.0); // avoid dividing by tiny distances
+    float a = max(atten, 0.0001); // avoid dividing by zero attenuation
+
+    if (formula == 1u) {
+        return light * 128.0 / (d * a);
+    } else if (formula == 2u) {
+        return light * 128.0 * 128.0 / (d * d * a);
+    } else if (formula == 3u) {
+        return light; // no distance falloff
+    } else if (formula == 5u) {
+        float da = d + 128.0;
+        return light * 128.0 * 128.0 / (da * da * a);
+    } else if (formula == 6u) {
+        float qd = max(d, 16.0);
+        return light * 128.0 * 128.0 / (qd * qd * a);
+    }
+
+    if (falloff > 0.0) { // only reached for formulas not handled above
+        return light * max(0.0, 1.0 - d / falloff);
+    }
+    return light - d * a; // may be negative; callers skip values <= 0
+}
+
+void clear_accum(uint sample_id) { // writes an all-zero accumulator for a sample that receives no light
+    accumBuffer.accum[sample_id].color = vec3(0.0);
+    accumBuffer.accum[sample_id].pad0 = 0.0;
+    accumBuffer.accum[sample_id].normal = vec3(0.0);
+    accumBuffer.accum[sample_id].pad1 = 0.0;
+    accumBuffer.accum[sample_id].hit = 0u;
+    accumBuffer.accum[sample_id].reserved0 = 0u;
+    accumBuffer.accum[sample_id].reserved1 = 0u;
+    accumBuffer.accum[sample_id].reserved2 = 0u;
+}
+
+void main() { // sums every listed source that reaches this invocation's sample
+    uint sample_id = gl_GlobalInvocationID.x;
+    if (sample_id >= pc.sampleCount) {
+        return;
+    }
+
+    GpuDirectPhaseSample s = sampleBuffer.samples[sample_id];
+    if (s.twosided < -0.5) { // the host marks samples it wants skipped with twosided = -1
+        clear_accum(sample_id);
+        return;
+    }
+
+    GpuDirectPhaseFaceRange r = faceRangeBuffer.ranges[s.faceIndex]; // NOTE(review): faceIndex is not range-checked here
+    if (r.sourceCount == 0u) {
+        clear_accum(sample_id);
+        return;
+    }
+
+    vec3 total_color = vec3(0.0);
+    vec3 total_normal = vec3(0.0);
+    uint any_hit = 0u;
+
+    for (uint local_i = 0u; local_i < r.sourceCount; ++local_i) {
+        uint source_id = faceSourceIndexBuffer.indices[r.sourceBegin + local_i];
+        if (source_id >= pc.sourceCount) {
+            continue;
+        }
+        GpuDirectPhaseSource l = sourceBuffer.sources[source_id];
+
+        vec3 ray_dir;
+        float ray_dist;
+        float value;
+        vec3 ncontrib_dir;
+
+        if (l.type == 1u) { // sun: fixed direction, traced for l.dist units
+            ray_dir = l.direction;
+            ray_dist = l.dist;
+            float angle = dot(ray_dir, s.normal);
+            if (s.twosided > 0.5 && angle < 0.0) angle = -angle;
+            angle = max(0.0, angle);
+            angle = (1.0 - l.anglescale) + l.anglescale * angle; // back-facing samples keep (1 - anglescale) of the light
+            value = l.light * angle;
+            ncontrib_dir = ray_dir;
+        } else { // point light: trace from the sample to the light position
+            vec3 to_light = l.position - s.position;
+            ray_dist = length(to_light);
+            if (ray_dist <= 0.01) {
+                continue;
+            }
+            ray_dir = to_light / ray_dist;
+            float angle = dot(ray_dir, s.normal);
+            if (s.twosided > 0.5 && angle < 0.0) angle = -angle;
+            if (angle <= 0.0) { // unlike the sun branch, back-facing samples get nothing
+                continue;
+            }
+            angle = (1.0 - l.anglescale) + l.anglescale * max(0.0, angle);
+            value = point_light_value(l.formula, l.light, l.atten, ray_dist, l.falloff) * angle;
+            ncontrib_dir = ray_dir;
+        }
+
+        if (value <= 0.0) {
+            continue;
+        }
+
+        float dirt_scale = ((l.flags & 1u) != 0u) ? clamp(s.occlusion, 0.0, 1.0) : 1.0; // NOTE(review): occlusion is used directly as the brightness multiplier; confirm the host supplies it as "1 = fully lit"
+        value *= dirt_scale;
+        if (value <= 0.0) {
+            continue;
+        }
+
+        if (!occluded(s.position, ray_dir, ray_dist)) { // the shadow ray is traced last, only for sources that would contribute
+            total_color += l.color * (value / 255.0);
+            total_normal += ncontrib_dir * value;
+            any_hit = 1u;
+        }
+    }
+
+    accumBuffer.accum[sample_id].color = total_color;
+    accumBuffer.accum[sample_id].pad0 = 0.0;
+    accumBuffer.accum[sample_id].normal = total_normal;
+    accumBuffer.accum[sample_id].pad1 = 0.0;
+    accumBuffer.accum[sample_id].hit = any_hit;
+    accumBuffer.accum[sample_id].reserved0 = 0u;
+    accumBuffer.accum[sample_id].reserved1 = 0u;
+    accumBuffer.accum[sample_id].reserved2 = 0u;
+}
diff --git a/light/gpu_shaders/occlusion.comp b/light/gpu_shaders/occlusion.comp
new file mode 100644
index 00000000..d60d4fe8
--- /dev/null
+++ b/light/gpu_shaders/occlusion.comp
@@ -0,0 +1,59 @@
+#version 460
+#extension GL_EXT_ray_query : require
+#extension GL_EXT_scalar_block_layout : require
+
+layout(local_size_x = 128) in; // main() handles one ray per invocation
+
+struct GpuRay { // host side: gpu_light::ray_t
+    float ox; float oy; float oz; float tmin;
+    float dx; float dy; float dz; float tmax;
+    uint shadowMask;
+    uint userIndex;
+};
+
+struct GpuOcclusionResult { // host side: gpu_light::occlusion_result_t
+    uint occluded;
+    uint reserved0;
+    float tr;
+    float tg;
+    float tb;
+};
+
+layout(push_constant) uniform PushConstants {
+    uint rayCount; // invocations at or beyond this index exit immediately
+    uint flags;
+} pc;
+
+layout(set = 0, binding = 0) uniform accelerationStructureEXT sceneAS;
+layout(scalar, set = 0, binding = 1) readonly buffer RayBuffer { GpuRay rays[]; } rayBuffer;
+layout(scalar, set = 0, binding = 2) writeonly buffer ResultBuffer { GpuOcclusionResult results[]; } resultBuffer;
+
+void main() { // traces ray i and stores whether anything was hit
+    uint i = gl_GlobalInvocationID.x;
+    if (i >= pc.rayCount) {
+        return;
+    }
+
+    GpuRay r = rayBuffer.rays[i];
+
+    rayQueryEXT rq;
+    rayQueryInitializeEXT(
+        rq,
+        sceneAS,
+        gl_RayFlagsTerminateOnFirstHitEXT | gl_RayFlagsOpaqueEXT, // stop at the first hit; treat all geometry as opaque
+        0xff, // r.shadowMask is not consulted; every instance is visible to the ray
+        vec3(r.ox, r.oy, r.oz),
+        r.tmin,
+        normalize(vec3(r.dx, r.dy, r.dz)), // NOTE(review): a zero-length direction is not guarded against here
+        r.tmax);
+
+    while (rayQueryProceedEXT(rq)) {
+    }
+
+    bool hit =
rayQueryGetIntersectionTypeEXT(rq, true) != gl_RayQueryCommittedIntersectionNoneEXT;
+    resultBuffer.results[i].occluded = hit ? 1u : 0u;
+    resultBuffer.results[i].reserved0 = 0u;
+    resultBuffer.results[i].tr = 1.0;
+    resultBuffer.results[i].tg = 1.0;
+    resultBuffer.results[i].tb = 1.0;
+}
diff --git a/light/light.cc b/light/light.cc
index a065f820..b20e4d3d 100644
--- a/light/light.cc
+++ b/light/light.cc
@@ -31,6 +31,7 @@
 #include
 #include // for facesup_t
 #include
+#include
 #include
 
 #include
@@ -292,6 +293,11 @@ light_settings::light_settings()
       novanilla{this, "novanilla", false, &experimental_group, "implies -bspxlit; don't write vanilla lighting"},
       gate{this, "gate", LIGHT_EQUAL_EPSILON, &performance_group, "cutoff lights at this brightness level"},
       sunsamples{this, "sunsamples", 64, 8, 2048, &performance_group, "set samples for _sunlight2, default 64"},
+      gpusunmerge{this, "gpusunmerge", false, &performance_group, "approximate-merge nearby GPU sun jitter rays"},
+      gpusunmergequality{this, "gpusunmergequality", 0.75f, 0.0f, 1.0f, &performance_group, "GPU sun merge quality: 0 fast/rough, 1 slow/high quality"},
+      gpusourcecull{this, "gpusourcecull", false, &performance_group, "use approximate GPU per-face source culling"},
+      gpusourcecullquality{this, "gpusourcecullquality", 1.0f, 0.0f, 1.0f, &performance_group, "GPU source culling quality: 0 fast/aggressive, 1 safest/conservative"},
+      gpu{this, "gpu", false, &performance_group, "use Vulkan GPU ray-query backend for batched visibility rays"},
       arghradcompat{this, "arghradcompat", false, &output_group, "enable compatibility for Arghrad-specific keys"},
       nolighting{this, "nolighting", false, &output_group, "don't output main world lighting (Q2RTX)"},
       debugface{this, "debugface", std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(),
@@ -1339,6 +1345,20 @@ int light_main(int argc, const char **argv)
     FindDebugVert(&bsp);
 
     Embree_TraceInit(&bsp);
+#if defined(HAVE_GPU_LIGHT)
+    if (light_options.gpu.value()) {
+        if (!GPU_TraceInit(&bsp)) {
+            logging::print("WARNING: -gpu requested, but GPU trace init failed: {}\n", GPU_TraceLastError());
+        } else {
+            logging::print("GPU light tracing enabled.\n");
+        }
+    }
+#else
+    if (light_options.gpu.value()) {
+        logging::print("WARNING: -gpu requested, but light was built without LIGHT_ENABLE_VULKAN_GPU=ON; using CPU tracing.\n");
+    }
+#endif
+
 
     if (light_options.debugmode == debugmodes::phong_obj) {
         CalculateVertexNormals(&bsp);
@@ -1409,7 +1429,19 @@ int light_main(int argc, const char **argv)
 
     logging::print("{} empty lightmaps\n", static_cast(fully_transparent_lightmaps));
+
+#if defined(HAVE_GPU_LIGHT)
+    // Drain the queued GPU direct-light work and release the backend before
+    // the log is closed, so flush statistics and dispatch errors are logged.
+    // NOTE(review): this flush runs at the very end of light_main; confirm the
+    // lightmaps it updates have not already been written out by then.
+    if (light_options.gpu.value()) {
+        GPU_DirectQueue_Flush(&bsp);
+        GPU_TraceShutdown();
+    }
+#endif
+
     logging::close();
     return 0;
 }
 
 int light_main(const std::vector &args)
diff --git a/light/ltface.cc b/light/ltface.cc
index 4634e54e..649312ed 100644
--- a/light/ltface.cc
+++ b/light/ltface.cc
@@ -18,6 +18,11 @@
  */
 
 #include
+#include
+#include
+#include
+#include
+#include
 
 #include
 #include
@@ -2556,6 +2561,512 @@ lightsurf_t CreateLightmapSurface(const mbsp_t *bsp, const mface_t *face, const
     return Lightsurf_Init(modelinfo, cfg, face, bsp, facesup, facesup_decoupled);
 }
 
+
+
+#if defined(HAVE_GPU_LIGHT)
+namespace {
+struct gpu_direct_face_record_t { // one queued face: where its samples sit in the shared sample queue
+    lightsurf_t *lightsurf = nullptr;
+    lightmapdict_t *lightmaps = nullptr;
+    std::size_t first_sample = 0;
+    std::size_t sample_count = 0;
+};
+
+std::mutex g_gpu_direct_queue_mutex; // taken by GPU_DirectQueue_Flush and GPU_DirectQueue_AddFace before touching the queues below
+std::vector g_gpu_direct_samples;
+std::vector g_gpu_direct_sources;
+std::vector g_gpu_direct_face_ranges;
+std::vector g_gpu_direct_face_source_indices;
+std::vector g_gpu_direct_faces;
+bool g_gpu_direct_sources_built = false; // set on the first source build attempt, successful or not
+bool g_gpu_direct_disabled = false; // set when an unsupported light is found or a dispatch fails
+
+static constexpr std::size_t GPU_DIRECT_FLUSH_SAMPLES = 1024ull * 1024ull; // queue size that triggers a flush
+
+struct gpu_direct_source_key_t { // quantized source fields; sources with equal keys are merge candidates
+    std::uint32_t type = 0;
+    std::uint32_t formula = 0;
+    std::uint32_t flags = 0;
+    int px = 0, py = 0, pz = 0;
+    int dx = 0, dy = 0, dz = 0;
+    int atten = 0, anglescale = 0, falloff = 0;
+};
+
+// Optional approximate sun-direction merge. Disabled by default for final quality.
+// When enabled with -gpusunmerge, -gpusunmergequality maps to a direction quantization scale:
+//   0.00 => 16    fastest/roughest
+//   0.50 => 256   balanced preview
+//   1.00 => 4096  best quality/least merging
+static float GPU_Direct_SunMergeQuality()
+{
+    float q = light_options.gpusunmergequality.value();
+    if (!std::isfinite(q)) {
+        q = 0.75f;
+    }
+    if (q < 0.0f) q = 0.0f;
+    if (q > 1.0f) q = 1.0f;
+    return q;
+}
+
+// Quantization scale applied to sun directions when building merge keys.
+static float GPU_Direct_SunMergeScale()
+{
+    if (!light_options.gpusunmerge.value()) {
+        return 65536.0f; // effectively exact; preserves final-quality sun jitter
+    }
+    return 16.0f * std::pow(256.0f, GPU_Direct_SunMergeQuality());
+}
+
+// -gpusourcecullquality clamped to [0, 1]; non-finite values fall back to 1.
+static float GPU_Direct_SourceCullQuality()
+{
+    float q = light_options.gpusourcecullquality.value();
+    if (!std::isfinite(q)) {
+        q = 1.0f;
+    }
+    if (q < 0.0f) q = 0.0f;
+    if (q > 1.0f) q = 1.0f;
+    return q;
+}
+
+static bool GPU_Direct_SourceCullEnabled()
+{
+    return light_options.gpusourcecull.value();
+}
+
+// Rounds v * scale to the nearest integer.
+static int GPU_Direct_Quantize(float v, float scale = 4096.0f)
+{
+    return static_cast(std::lround(v * scale));
+}
+
+// Builds the merge key for a source. Color and brightness are deliberately not
+// part of the key: they are combined by GPU_Direct_MergeInto.
+static gpu_direct_source_key_t GPU_Direct_SourceKey(const gpu_light::direct_phase_source_t &s)
+{
+    gpu_direct_source_key_t k{};
+    k.type = s.type;
+    k.formula = s.formula;
+    k.flags = s.flags;
+    k.px = GPU_Direct_Quantize(s.px);
+    k.py = GPU_Direct_Quantize(s.py);
+    k.pz = GPU_Direct_Quantize(s.pz);
+    const float dir_scale = (s.type == 1u) ? GPU_Direct_SunMergeScale() : 4096.0f;
+    k.dx = GPU_Direct_Quantize(s.dx, dir_scale);
+    k.dy = GPU_Direct_Quantize(s.dy, dir_scale);
+    k.dz = GPU_Direct_Quantize(s.dz, dir_scale);
+    k.atten = GPU_Direct_Quantize(s.atten, 1024.0f);
+    k.anglescale = GPU_Direct_Quantize(s.anglescale, 1024.0f);
+    k.falloff = GPU_Direct_Quantize(s.falloff, 1024.0f);
+    return k;
+}
+
+static bool GPU_Direct_SourceKeyEquals(const gpu_direct_source_key_t &a, const gpu_direct_source_key_t &b)
+{
+    return a.type == b.type && a.formula == b.formula && a.flags == b.flags &&
+           a.px == b.px && a.py == b.py && a.pz == b.pz &&
+           a.dx == b.dx && a.dy == b.dy && a.dz == b.dz &&
+           a.atten == b.atten && a.anglescale == b.anglescale && a.falloff == b.falloff;
+}
+
+// Folds src into dst: brightness is summed, color becomes the
+// brightness-weighted average, and for suns the direction becomes the
+// normalized brightness-weighted average.
+static void GPU_Direct_MergeInto(gpu_light::direct_phase_source_t &dst, const gpu_light::direct_phase_source_t &src)
+{
+    const float a = std::max(dst.light, 0.0f);
+    const float b = std::max(src.light, 0.0f);
+    const float total = a + b;
+    if (total <= 0.0f) {
+        return;
+    }
+
+    dst.cr = (dst.cr * a + src.cr * b) / total;
+    dst.cg = (dst.cg * a + src.cg * b) / total;
+    dst.cb = (dst.cb * a + src.cb * b) / total;
+
+    if (dst.type == 1u) {
+        qvec3f d{dst.dx * a + src.dx * b, dst.dy * a + src.dy * b, dst.dz * a + src.dz * b};
+        const float len2 = qv::dot(d, d);
+        if (len2 > 0.0001f) {
+            d = d * (1.0f / std::sqrt(len2));
+            dst.dx = d[0];
+            dst.dy = d[1];
+            dst.dz = d[2];
+        }
+    }
+
+    dst.light = total;
+}
+
+// Returns true when the shader's brightness for this source scales linearly
+// with `light`, so that summing the brightness of two sources with equal keys
+// gives the same result as evaluating them one by one. Mirrors the branches of
+// point_light_value() in direct_phase.comp.
+static bool GPU_Direct_SourceIsMergeable(const gpu_light::direct_phase_source_t &src)
+{
+    if (src.type == 1u) {
+        return true; // sun: value = light * angle
+    }
+    switch (src.formula) {
+    case 1u:
+    case 2u:
+    case 3u:
+    case 5u:
+    case 6u:
+        return true; // light times a distance-only factor
+    default:
+        // light * (1 - d / falloff) is linear in light, but light - d * atten
+        // is not: merging would subtract the distance term only once.
+        return src.falloff > 0.0f;
+    }
+}
+
+// Appends src to g_gpu_direct_sources, or folds it into an existing source with
+// the same key when that is exact. keys[i] always describes
+// g_gpu_direct_sources[i].
+static void GPU_Direct_AddMergedSource(
+    std::vector &keys,
+    const gpu_light::direct_phase_source_t &src)
+{
+    const auto key = GPU_Direct_SourceKey(src);
+    if (GPU_Direct_SourceIsMergeable(src)) {
+        for (std::size_t i = 0; i < keys.size(); ++i) {
+            if (GPU_Direct_SourceKeyEquals(keys[i], key)) {
+                GPU_Direct_MergeInto(g_gpu_direct_sources[i], src);
+                return;
+            }
+        }
+    }
+    keys.push_back(key);
+    g_gpu_direct_sources.push_back(src);
+}
+
+static float GPU_Direct_EffectivePointRadius(const gpu_light::direct_phase_source_t &src)
+{
+    if (src.type == 1) {
+        return MAX_SKY_DIST;
+    }
+    if
(src.formula == 1u || src.formula == 2u || src.formula == 3u || src.formula == 5u || src.formula == 6u) {
+        // Inverse and infinite formulas: point_light_value() in
+        // direct_phase.comp never consults _falloff for these, so they have no
+        // finite reach and must not be culled by distance.
+        return MAX_SKY_DIST;
+    }
+    if (src.falloff > 0.0f) {
+        return std::min(src.falloff, static_cast(MAX_SKY_DIST));
+    }
+    // Conservative only for LF_LINEAR/default: value = light - distance * atten.
+    if (src.formula == 0u && src.atten > 0.0001f && src.light > 0.0f) {
+        return std::min(src.light / src.atten, static_cast(MAX_SKY_DIST));
+    }
+    return MAX_SKY_DIST;
+}
+
+// Squared distance from the source position to the box [mins, maxs]; zero when
+// the position is inside the box.
+static float GPU_Direct_PointAABBDistance2(
+    const gpu_light::direct_phase_source_t &src,
+    const qvec3f &mins,
+    const qvec3f &maxs)
+{
+    const float p[3] = {src.px, src.py, src.pz};
+    float d2 = 0.0f;
+    for (int axis = 0; axis < 3; ++axis) {
+        if (p[axis] < mins[axis]) {
+            const float d = mins[axis] - p[axis];
+            d2 += d * d;
+        } else if (p[axis] > maxs[axis]) {
+            const float d = p[axis] - maxs[axis];
+            d2 += d * d;
+        }
+    }
+    return d2;
+}
+
+// Decides whether a source is listed for a face. Always true unless
+// -gpusourcecull is on and the face is one-sided; otherwise sources are dropped
+// by distance (point lights with a finite reach) and by facing, with thresholds
+// that tighten as the cull quality decreases.
+static bool GPU_Direct_SourceAffectsFace(
+    const gpu_light::direct_phase_source_t &src,
+    const qvec3f &mins,
+    const qvec3f &maxs,
+    const qvec3f &normal,
+    bool twosided)
+{
+    if (!GPU_Direct_SourceCullEnabled()) {
+        return true;
+    }
+    if (twosided) {
+        return true;
+    }
+
+    const float quality = GPU_Direct_SourceCullQuality();
+
+    if (src.type == 1) {
+        // Sun normal culling is the quality-sensitive part. At max quality we keep
+        // all sun jitter directions for every face. Lower quality progressively
+        // removes back-facing sun directions.
+        if (quality >= 0.999f) {
+            return true;
+        }
+        const qvec3f dir{src.dx, src.dy, src.dz};
+        const float threshold = -0.50f + (0.55f * (1.0f - quality)); // q=0 -> 0.05, q=1 -> -0.50
+        return qv::dot(normal, dir) > threshold;
+    }
+
+    const float radius = GPU_Direct_EffectivePointRadius(src);
+    if (radius < static_cast(MAX_SKY_DIST) * 0.999f) {
+        // More quality = more radius padding = less chance of missing a faint edge case.
+        const float padded_radius = radius * (1.0f + 3.0f * quality) + 256.0f * quality;
+        const float d2 = GPU_Direct_PointAABBDistance2(src, mins, maxs);
+        if (d2 > padded_radius * padded_radius) {
+            return false;
+        }
+    }
+
+    // Conservative face-normal cull for point lights. At max quality this is disabled;
+    // lower quality allows removing back-facing points.
+    if (quality < 0.999f) {
+        const qvec3f center{
+            (mins[0] + maxs[0]) * 0.5f,
+            (mins[1] + maxs[1]) * 0.5f,
+            (mins[2] + maxs[2]) * 0.5f};
+        qvec3f to_light{src.px - center[0], src.py - center[1], src.pz - center[2]};
+        const float to_light_len2 = qv::dot(to_light, to_light);
+        if (to_light_len2 > 0.0001f) {
+            to_light = to_light * (1.0f / std::sqrt(to_light_len2));
+            const float threshold = -0.75f + (0.65f * (1.0f - quality)); // q=0 -> -0.10, q=1 -> -0.75
+            if (qv::dot(normal, to_light) <= threshold) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+// Builds g_gpu_direct_sources from the entity lights and suns on the first
+// call; later calls only report whether the GPU direct path is still enabled.
+// Returns false, and disables the GPU direct path, when a light uses a feature
+// this path does not handle.
+static bool GPU_DirectQueue_BuildSourcesLocked()
+{
+    if (g_gpu_direct_sources_built) {
+        return !g_gpu_direct_disabled;
+    }
+    g_gpu_direct_sources_built = true;
+    g_gpu_direct_sources.clear();
+    std::vector unique_keys;
+
+    std::size_t raw_sources = 0;
+    std::size_t raw_point_sources = 0;
+    std::size_t raw_sun_sources = 0;
+    for (const auto &entity_ptr : GetLights()) {
+        const light_t *entity = entity_ptr.get();
+        if (entity->nostaticlight.value()) continue;
+        if (entity->light.value() <= 0) continue;
+        if (entity->sun.value()) continue;
+
+        if (entity->style.value() != 0 ||
+            entity->shadow_channel_mask.value() != CHANNEL_MASK_DEFAULT ||
+            entity->light_channel_mask.value() != CHANNEL_MASK_DEFAULT ||
+            entity->spotlight || entity->projectedmip ||
+            entity->getFormula() == LF_LOCALMIN) {
+            logging::print("GPU direct phase: unsupported entity light encountered; falling back to CPU direct path.\n");
+            g_gpu_direct_disabled = true;
+            return false;
+        }
+
+        gpu_light::direct_phase_source_t src{};
+        const qvec3f origin = entity->origin.value();
+        const qvec3f color
= entity->color.value();
+        src.px = origin[0]; src.py = origin[1]; src.pz = origin[2];
+        src.light = entity->light.value();
+        // Point lights carry no direction; dist is unused for them (the shader
+        // derives the ray length from the sample-to-light distance).
+        src.dx = 0; src.dy = 0; src.dz = 1; src.dist = 0;
+        src.cr = color[0]; src.cg = color[1]; src.cb = color[2];
+        src.atten = entity->atten.value();
+        src.type = 0;
+        src.formula = static_cast(entity->getFormula());
+        // NOTE(review): any non-zero dirt value sets the dirt flag; confirm
+        // that is intended for every value the dirt key can hold.
+        src.flags = entity->dirt.value() ? 1u : 0u;
+        src.anglescale = entity->anglescale.value();
+        src.dirt = entity->dirt.value();
+        src.falloff = entity->falloff.value();
+        ++raw_sources;
+        ++raw_point_sources;
+        GPU_Direct_AddMergedSource(unique_keys, src);
+    }
+
+    // Suns: one source per sun_t with positive brightness.
+    for (const sun_t &sun : GetSuns()) {
+        if (sun.sunlight <= 0) continue;
+        if (sun.style != 0 || sun.suntexture_value) {
+            logging::print("GPU direct phase: unsupported sun style/texture encountered; falling back to CPU direct path.\n");
+            g_gpu_direct_disabled = true;
+            return false;
+        }
+        qvec3f incoming = qv::normalize(sun.sunvec);
+        gpu_light::direct_phase_source_t src{};
+        src.type = 1;
+        src.dx = incoming[0]; src.dy = incoming[1]; src.dz = incoming[2];
+        src.dist = MAX_SKY_DIST;
+        src.light = sun.sunlight;
+        src.cr = sun.sunlight_color[0]; src.cg = sun.sunlight_color[1]; src.cb = sun.sunlight_color[2];
+        src.atten = 1;
+        src.formula = 0;
+        src.flags = sun.dirt ? 1u : 0u;
+        src.anglescale = sun.anglescale;
+        src.dirt = sun.dirt ? 1.0f : 0.0f;
+        ++raw_sources;
+        ++raw_sun_sources;
+        GPU_Direct_AddMergedSource(unique_keys, src);
+    }
+
+    // Everything below only gathers numbers for the summary log line.
+    std::size_t merged_point_sources = 0;
+    std::size_t merged_sun_sources = 0;
+    for (const auto &src : g_gpu_direct_sources) {
+        if (src.type == 1u) ++merged_sun_sources;
+        else ++merged_point_sources;
+    }
+
+    const bool sun_merge_enabled = light_options.gpusunmerge.value();
+    const float sun_merge_quality = GPU_Direct_SunMergeQuality();
+    const float sun_merge_scale = GPU_Direct_SunMergeScale();
+    const bool source_cull_enabled = GPU_Direct_SourceCullEnabled();
+    const float source_cull_quality = GPU_Direct_SourceCullQuality();
+    logging::print("GPU direct phase: queued {} direct sources ({} raw: {} point, {} sun; merged: {} point, {} sun; {} merged away; sun merge {}; quality {:.2f}; scale {:.1f}; source cull {}; quality {:.2f}).\n",
+        g_gpu_direct_sources.size(), raw_sources, raw_point_sources, raw_sun_sources,
+        merged_point_sources, merged_sun_sources, raw_sources - g_gpu_direct_sources.size(),
+        sun_merge_enabled ? "on" : "off", sun_merge_quality, sun_merge_scale,
+        source_cull_enabled ? "on" : "off", source_cull_quality);
+    return true;
+}
+
+// Number of sample/source pairs in the current queue: for every queued sample,
+// the number of sources listed for its face. Used for logging only.
+static std::uint64_t GPU_DirectQueue_ImplicitRayCountLocked()
+{
+    std::uint64_t implicit_rays = 0;
+    for (const auto &sample : g_gpu_direct_samples) {
+        const std::size_t face_index = sample.face_index;
+        if (face_index < g_gpu_direct_face_ranges.size()) {
+            implicit_rays += g_gpu_direct_face_ranges[face_index].source_count;
+        }
+    }
+    return implicit_rays;
+}
+
+// Dispatches every queued sample to the GPU, adds the results to the style-0
+// lightmap of each queued face and empties the queues. Returns false, after
+// disabling the GPU direct path and discarding the queue, when the dispatch
+// fails. The "Locked" suffix means the caller holds g_gpu_direct_queue_mutex.
+static bool GPU_DirectQueue_FlushLocked(const mbsp_t *bsp)
+{
+    if (g_gpu_direct_samples.empty()) {
+        g_gpu_direct_faces.clear();
+        g_gpu_direct_face_ranges.clear();
+        g_gpu_direct_face_source_indices.clear();
+        return true;
+    }
+
+    const auto t0 = std::chrono::steady_clock::now();
+    std::vector accum(g_gpu_direct_samples.size());
+    const bool ok = gpu_light::trace_direct_phase_batch(
+        g_gpu_direct_sources.data(),
+        g_gpu_direct_sources.size(),
+        g_gpu_direct_samples.data(),
+        accum.data(),
+        g_gpu_direct_samples.size(),
+        g_gpu_direct_face_ranges.data(),
+        g_gpu_direct_face_ranges.size(),
+        g_gpu_direct_face_source_indices.data(),
+        g_gpu_direct_face_source_indices.size());
+    const auto t1 = std::chrono::steady_clock::now();
+    const double gpu_ms = std::chrono::duration_cast(t1 - t0).count() / 1000.0;
+
+    if (!ok) {
+        // NOTE(review): the faces queued so far were reported as handled and
+        // are dropped here, so they receive no direct light in this run.
+        g_gpu_direct_disabled = true;
+        logging::print("ERROR: GPU direct phase dispatch failed: {}\n", GPU_TraceLastError());
+        logging::print("ERROR: disabling GPU direct phase for the rest of this run. 
Re-run without -gpu for guaranteed CPU output.\n");
+        g_gpu_direct_samples.clear();
+        g_gpu_direct_faces.clear();
+        g_gpu_direct_face_ranges.clear();
+        g_gpu_direct_face_source_indices.clear();
+        return false;
+    }
+
+    // Add each face's accumulated light to its style-0 lightmap; faces where no
+    // sample was lit are left untouched.
+    for (const auto &rec : g_gpu_direct_faces) {
+        if (!rec.lightsurf || !rec.lightmaps || rec.sample_count == 0) {
+            continue;
+        }
+        lightmap_t *lightmap = Lightmap_ForStyle(rec.lightmaps, 0, rec.lightsurf);
+        bool hit = false;
+        for (std::size_t i = 0; i < rec.sample_count; ++i) {
+            const std::size_t gi = rec.first_sample + i;
+            if (!accum[gi].hit) continue;
+            const qvec3f color{accum[gi].cr, accum[gi].cg, accum[gi].cb};
+            const qvec3f normalcontrib{accum[gi].nr, accum[gi].ng, accum[gi].nb};
+            lightsample_t &sample = lightmap->samples[i];
+            sample.color += color;
+            sample.direction += normalcontrib;
+            lightmap->bounce_color += color;
+            hit = true;
+        }
+        if (hit) {
+            Lightmap_Save(bsp, rec.lightmaps, rec.lightsurf, lightmap, 0);
+        }
+    }
+
+    const std::uint64_t implicit_rays = GPU_DirectQueue_ImplicitRayCountLocked();
+    logging::print("GPU direct phase: flushed {} samples, {} unique sources, {} face-source refs = {} implicit rays in {:.3f} ms\n",
+        g_gpu_direct_samples.size(), g_gpu_direct_sources.size(), g_gpu_direct_face_source_indices.size(), implicit_rays, gpu_ms);
+
+    g_gpu_direct_samples.clear();
+    g_gpu_direct_faces.clear();
+    g_gpu_direct_face_ranges.clear();
+    g_gpu_direct_face_source_indices.clear();
+    return true;
+}
+} // namespace
+
+// Applies all queued GPU direct-light work to the lightmaps.
+void GPU_DirectQueue_Flush(const mbsp_t *bsp)
+{
+    std::lock_guard lock(g_gpu_direct_queue_mutex);
+    GPU_DirectQueue_FlushLocked(bsp);
+}
+
+// Queues one face for the GPU direct-light pass.
+//
+// Returns true when the face's positive entity lights and suns are taken care
+// of here (queued, or known to contribute nothing), so the caller must skip its
+// CPU loops. Returns false when the caller must light the face on the CPU.
+//
+// NOTE(review): the lightsurf and lightmaps pointers are kept until the next
+// flush; confirm both outlive it and are not modified concurrently.
+static bool GPU_DirectQueue_AddFace(const mbsp_t *bsp, lightsurf_t *lightsurf, lightmapdict_t *lightmaps)
+{
+    if (!GPU_TraceAvailable() || !lightsurf || !lightmaps) {
+        return false;
+    }
+
+    // All queue state, including g_gpu_direct_disabled, is only touched with
+    // the mutex held. Building the sources first also makes the result
+    // independent of the order in which threads reach this point: once the GPU
+    // path is disabled every face goes to the CPU, including the trivial ones
+    // handled below.
+    std::lock_guard lock(g_gpu_direct_queue_mutex);
+    if (!GPU_DirectQueue_BuildSourcesLocked()) {
+        return false;
+    }
+    if (!(lightsurf->object_channel_mask & CHANNEL_MASK_DEFAULT)) {
+        return true;
+    }
+    const std::size_t sample_count = lightsurf->samples.size();
+    if (!sample_count) {
+        return true;
+    }
+    if (g_gpu_direct_sources.empty()) {
+        return true;
+    }
+
+    // Bounds and average normal of the non-occluded samples, used for source culling.
+    qvec3f mins{std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max()};
+    qvec3f maxs{-std::numeric_limits::max(), -std::numeric_limits::max(), -std::numeric_limits::max()};
+    qvec3f normal_sum{0, 0, 0};
+    std::size_t valid_samples = 0;
+    for (const auto &sample : lightsurf->samples) {
+        if (sample.occluded) {
+            continue;
+        }
+        for (int axis = 0; axis < 3; ++axis) {
+            mins[axis] = std::min(mins[axis], sample.point[axis]);
+            maxs[axis] = std::max(maxs[axis], sample.point[axis]);
+        }
+        normal_sum += sample.normal;
+        ++valid_samples;
+    }
+    if (valid_samples == 0) {
+        return true;
+    }
+
+    qvec3f face_normal = lightsurf->snormal;
+    const float normal_len2 = qv::dot(normal_sum, normal_sum);
+    if (normal_len2 > 0.0001f) {
+        face_normal = normal_sum * (1.0f / std::sqrt(normal_len2));
+    }
+
+    const std::uint32_t face_index = static_cast(g_gpu_direct_face_ranges.size());
+    gpu_light::direct_phase_face_range_t face_range{};
+    face_range.source_begin = static_cast(g_gpu_direct_face_source_indices.size());
+
+    for (std::uint32_t source_index = 0; source_index < g_gpu_direct_sources.size(); ++source_index) {
+        if (GPU_Direct_SourceAffectsFace(g_gpu_direct_sources[source_index], mins, maxs, face_normal, lightsurf->twosided)) {
+            g_gpu_direct_face_source_indices.push_back(source_index);
+        }
+    }
+
+    face_range.source_count = static_cast(g_gpu_direct_face_source_indices.size()) - face_range.source_begin;
+    if (face_range.source_count == 0) {
+        return true;
+    }
+    g_gpu_direct_face_ranges.push_back(face_range);
+
+    const std::size_t first_sample = g_gpu_direct_samples.size();
+    g_gpu_direct_faces.push_back(gpu_direct_face_record_t{lightsurf, lightmaps, first_sample, sample_count});
+
+    // One GPU sample per lightmap sample, so GPU results map back by index.
+    // Occluded samples are sent with twosided = -1, which the shader skips.
+    for (const auto &sample : lightsurf->samples) {
+        gpu_light::direct_phase_sample_t s{};
+        s.face_index = face_index;
+        if (!sample.occluded) {
+            s.px = sample.point[0]; s.py = sample.point[1]; s.pz = sample.point[2];
+            s.nx = sample.normal[0]; s.ny = sample.normal[1]; s.nz = sample.normal[2];
+            s.occlusion = sample.occlusion;
+            s.twosided = lightsurf->twosided ? 1.0f : 0.0f;
+        } else {
+            s.twosided = -1.0f;
+        }
+        g_gpu_direct_samples.push_back(s);
+    }
+
+    if (g_gpu_direct_samples.size() >= GPU_DIRECT_FLUSH_SAMPLES) {
+        // A failed flush discards the queue, this face included, without
+        // applying anything; report that so the caller lights it on the CPU.
+        return GPU_DirectQueue_FlushLocked(bsp);
+    }
+    return true;
+}
+#endif
+
 /*
  * ============
  * LightFace
@@ -2587,7 +3098,10 @@ void DirectLightFace(const mbsp_t *bsp, lightsurf_t &lightsurf, const settings::
 
     /* positive lights */
     if (!(modelinfo->lightignore.value() || extended_flags.light_ignore)) {
+#if defined(HAVE_GPU_LIGHT)
+        if (!GPU_DirectQueue_AddFace(bsp, &lightsurf, lightmaps)) { // CPU loops below run only when the GPU queue did not take the face
+#endif
         for (const auto &entity : GetLights()) {
             if (entity->getFormula() == LF_LOCALMIN)
                 continue;
             if (entity->nostaticlight.value())
@@ -2598,6 +3112,9 @@ void DirectLightFace(const mbsp_t *bsp, lightsurf_t &lightsurf, const settings::
         for (const sun_t &sun : GetSuns())
             if (sun.sunlight > 0)
                 LightFace_Sky(bsp, &sun, &lightsurf, lightmaps);
+#if defined(HAVE_GPU_LIGHT)
+        }
+#endif
 
         // mxd. Add surface lights...
// FIXME: negative surface lights diff --git a/light/trace_gpu.cc b/light/trace_gpu.cc new file mode 100644 index 00000000..1f5a1732 --- /dev/null +++ b/light/trace_gpu.cc @@ -0,0 +1,248 @@ +#include + +#include +#include +#include + +#if defined(HAVE_GPU_LIGHT) +namespace gpu_light::vulkan_backend { +bool init(const mbsp_t *bsp, std::string &error); +void shutdown(); +bool trace_occlusion_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::ray_t *rays, + gpu_light::occlusion_result_t *results, + std::size_t count, + std::string &error); + +bool trace_direct_phase_batch( + const gpu_light::direct_phase_source_t *sources, + std::size_t source_count, + const gpu_light::direct_phase_sample_t *samples, + gpu_light::direct_phase_accum_t *accum, + std::size_t sample_count, + const gpu_light::direct_phase_face_range_t *face_ranges, + std::size_t face_range_count, + const std::uint32_t *face_source_indices, + std::size_t face_source_index_count, + std::string &error); + +bool trace_direct_accumulate_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::direct_job_t *jobs, + std::size_t job_count, + const gpu_light::direct_sample_range_t *ranges, + gpu_light::direct_accum_t *accum, + std::size_t sample_count, + std::string &error); +} // namespace gpu_light::vulkan_backend +#endif + +namespace gpu_light { +namespace { +std::mutex g_mutex; +backend_state_t g_state = backend_state_t::unavailable; +std::string g_last_error; +stats_t g_stats; +} // namespace + +bool requested() { + // Keeping this function independent avoids pulling all light settings into this TU. 
+ return true; +} + +backend_state_t state() { + std::lock_guard lock(g_mutex); + return g_state; +} + +const char *state_string() { + switch (state()) { + case backend_state_t::unavailable: return "unavailable"; + case backend_state_t::initialized: return "initialized"; + case backend_state_t::failed: return "failed"; + } + return "unknown"; +} + +const char *last_error() { + std::lock_guard lock(g_mutex); + return g_last_error.c_str(); +} + +stats_t stats() { + std::lock_guard lock(g_mutex); + return g_stats; +} + +bool init(const mbsp_t *bsp) { + std::lock_guard lock(g_mutex); +#if defined(HAVE_GPU_LIGHT) + g_last_error.clear(); + if (vulkan_backend::init(bsp, g_last_error)) { + g_state = backend_state_t::initialized; + return true; + } + g_state = backend_state_t::failed; + return false; +#else + (void)bsp; + g_last_error = "light was built without LIGHT_ENABLE_VULKAN_GPU=ON"; + g_state = backend_state_t::unavailable; + return false; +#endif +} + +void shutdown() { + std::lock_guard lock(g_mutex); +#if defined(HAVE_GPU_LIGHT) + vulkan_backend::shutdown(); +#endif + g_state = backend_state_t::unavailable; +} + +bool trace_occlusion_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const ray_t *rays, + occlusion_result_t *results, + std::size_t count) { + if (!rays || !results || count == 0) { + return true; + } + + { + std::lock_guard lock(g_mutex); + g_stats.batches++; + g_stats.rays += count; + if (g_state != backend_state_t::initialized) { + g_stats.fallback_batches++; + return false; + } + } + +#if defined(HAVE_GPU_LIGHT) + std::string error; + const bool ok = vulkan_backend::trace_occlusion_batch(self, shadow_mask, rays, results, count, error); + std::lock_guard lock(g_mutex); + if (ok) { + g_stats.gpu_batches++; + return true; + } + g_stats.fallback_batches++; + if (!error.empty()) { + g_last_error = error; + } + return false; +#else + (void)self; + (void)shadow_mask; + return false; +#endif +} + + +bool trace_direct_phase_batch( + const 
direct_phase_source_t *sources, + std::size_t source_count, + const direct_phase_sample_t *samples, + direct_phase_accum_t *accum, + std::size_t sample_count, + const direct_phase_face_range_t *face_ranges, + std::size_t face_range_count, + const std::uint32_t *face_source_indices, + std::size_t face_source_index_count) { + if (!sources || !samples || !accum || !face_ranges || !face_source_indices || source_count == 0 || sample_count == 0 || face_range_count == 0) { + return true; + } + + std::uint64_t implicit_rays = 0; + for (std::size_t i = 0; i < face_range_count; ++i) { + implicit_rays += face_ranges[i].source_count; + } + if (implicit_rays == 0 || face_source_index_count == 0) { + return true; + } + implicit_rays *= static_cast(sample_count) / static_cast(face_range_count); + { + std::lock_guard lock(g_mutex); + g_stats.batches++; + g_stats.rays += implicit_rays; + if (g_state != backend_state_t::initialized) { + g_stats.fallback_batches++; + return false; + } + } + +#if defined(HAVE_GPU_LIGHT) + std::string error; + const bool ok = vulkan_backend::trace_direct_phase_batch( + sources, source_count, samples, accum, sample_count, + face_ranges, face_range_count, face_source_indices, face_source_index_count, error); + std::lock_guard lock(g_mutex); + if (ok) { + g_stats.gpu_batches++; + return true; + } + g_stats.fallback_batches++; + if (!error.empty()) { + g_last_error = error; + } + return false; +#else + return false; +#endif +} + + +bool trace_direct_accumulate_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const direct_job_t *jobs, + std::size_t job_count, + const direct_sample_range_t *ranges, + direct_accum_t *accum, + std::size_t sample_count) { + if (!jobs || !ranges || !accum || job_count == 0 || sample_count == 0) { + return true; + } + + { + std::lock_guard lock(g_mutex); + g_stats.batches++; + g_stats.rays += job_count; + if (g_state != backend_state_t::initialized) { + g_stats.fallback_batches++; + return false; + } + } + +#if 
defined(HAVE_GPU_LIGHT) + std::string error; + const bool ok = vulkan_backend::trace_direct_accumulate_batch( + self, shadow_mask, jobs, job_count, ranges, accum, sample_count, error); + std::lock_guard lock(g_mutex); + if (ok) { + g_stats.gpu_batches++; + return true; + } + g_stats.fallback_batches++; + if (!error.empty()) { + g_last_error = error; + } + return false; +#else + (void)self; + (void)shadow_mask; + return false; +#endif +} + +} // namespace gpu_light + +bool GPU_TraceInit(const mbsp_t *bsp) { return gpu_light::init(bsp); } +void GPU_TraceShutdown() { gpu_light::shutdown(); } +bool GPU_TraceAvailable() { return gpu_light::state() == gpu_light::backend_state_t::initialized; } +const char *GPU_TraceLastError() { return gpu_light::last_error(); } diff --git a/light/trace_gpu_vulkan.cc b/light/trace_gpu_vulkan.cc new file mode 100644 index 00000000..da78638e --- /dev/null +++ b/light/trace_gpu_vulkan.cc @@ -0,0 +1,1343 @@ +#include + +#if defined(HAVE_GPU_LIGHT) + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#include +#endif + +namespace gpu_light::vulkan_backend { +namespace { + +struct buffer_t { + VkBuffer buffer = VK_NULL_HANDLE; + VkDeviceMemory memory = VK_NULL_HANDLE; + VkDeviceSize size = 0; +}; + +struct as_t { + VkAccelerationStructureKHR as = VK_NULL_HANDLE; + buffer_t storage; + VkDeviceAddress address = 0; +}; + +struct vertex_t { + float x, y, z; +}; + +struct gpu_ray_host_t { + float ox, oy, oz, tmin; + float dx, dy, dz, tmax; + std::uint32_t shadow_mask; + std::uint32_t user_index; +}; + +struct gpu_result_host_t { + std::uint32_t occluded; + std::uint32_t reserved0; + float tr, tg, tb; +}; + +struct gpu_direct_job_host_t { + float ox, oy, oz, tmin; + float dx, dy, dz, tmax; + float cr, cg, cb, pad0; + float nr, ng, nb, pad1; + std::uint32_t sample_index; + 
// Host-side mirrors of the records exchanged with the compute shaders.
// Member order and types define the byte layout, so they must not be
// reordered; the file's static_asserts pin the sizes of the buffer records.

// One contiguous [first, first + count) slice.
struct gpu_direct_range_host_t {
    std::uint32_t first, count;
};

// Per-sample accumulation record: two 4-float groups followed by four words.
struct gpu_direct_accum_host_t {
    float cr;
    float cg;
    float cb;
    float pad0;
    float nr;
    float ng;
    float nb;
    float pad1;
    std::uint32_t hit, reserved0, reserved1, reserved2;
};

// Per-sample input of the direct phase.
struct gpu_direct_phase_sample_host_t {
    float px;
    float py;
    float pz;
    float occlusion;
    float nx;
    float ny;
    float nz;
    float twosided;
    std::uint32_t face_index, reserved0, reserved1, reserved2;
};

// Slice of the face-to-source index list that belongs to one face.
struct gpu_direct_phase_face_range_host_t {
    std::uint32_t source_begin, source_count;
};

// One light source as seen by the direct phase.
struct gpu_direct_phase_source_host_t {
    float px;
    float py;
    float pz;
    float light;
    float dx;
    float dy;
    float dz;
    float dist;
    float cr;
    float cg;
    float cb;
    float atten;
    std::uint32_t type, formula, flags, reserved0;
    float anglescale;
    float dirt;
    float falloff;
    float pad0;
};

// Push constants of the occlusion pipeline.
struct push_constants_t {
    std::uint32_t ray_count, flags;
};

// Push constants of the direct pipeline.
struct direct_push_constants_t {
    std::uint32_t sample_count, source_count, flags, reserved0;
};
layout must match shader"); + +struct context_t { + VkInstance instance = VK_NULL_HANDLE; + VkPhysicalDevice physical = VK_NULL_HANDLE; + VkDevice device = VK_NULL_HANDLE; + VkQueue queue = VK_NULL_HANDLE; + std::uint32_t queue_family = 0; + + VkPhysicalDeviceMemoryProperties memory_props{}; + + VkCommandPool command_pool = VK_NULL_HANDLE; + VkCommandBuffer command_buffer = VK_NULL_HANDLE; + + PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR_ = nullptr; + PFN_vkCreateAccelerationStructureKHR vkCreateAccelerationStructureKHR_ = nullptr; + PFN_vkDestroyAccelerationStructureKHR vkDestroyAccelerationStructureKHR_ = nullptr; + PFN_vkGetAccelerationStructureBuildSizesKHR vkGetAccelerationStructureBuildSizesKHR_ = nullptr; + PFN_vkCmdBuildAccelerationStructuresKHR vkCmdBuildAccelerationStructuresKHR_ = nullptr; + PFN_vkGetAccelerationStructureDeviceAddressKHR vkGetAccelerationStructureDeviceAddressKHR_ = nullptr; + PFN_vkCmdWriteAccelerationStructuresPropertiesKHR vkCmdWriteAccelerationStructuresPropertiesKHR_ = nullptr; + + buffer_t vertices; + buffer_t indices; + buffer_t instances; + as_t blas; + as_t tlas; + + VkDescriptorSetLayout descriptor_set_layout = VK_NULL_HANDLE; + VkPipelineLayout pipeline_layout = VK_NULL_HANDLE; + VkPipeline pipeline = VK_NULL_HANDLE; + VkDescriptorPool descriptor_pool = VK_NULL_HANDLE; + VkDescriptorSet descriptor_set = VK_NULL_HANDLE; + + VkDescriptorSetLayout direct_descriptor_set_layout = VK_NULL_HANDLE; + VkPipelineLayout direct_pipeline_layout = VK_NULL_HANDLE; + VkPipeline direct_pipeline = VK_NULL_HANDLE; + VkDescriptorPool direct_descriptor_pool = VK_NULL_HANDLE; + VkDescriptorSet direct_descriptor_set = VK_NULL_HANDLE; + + std::size_t triangle_count = 0; + bool has_filtered_embree_geometry = false; +}; + +std::mutex g_mutex; +context_t g; + +static std::string vk_result_string(VkResult r) { + switch (r) { + case VK_SUCCESS: return "VK_SUCCESS"; + case VK_NOT_READY: return "VK_NOT_READY"; + case VK_TIMEOUT: return 
"VK_TIMEOUT"; + case VK_EVENT_SET: return "VK_EVENT_SET"; + case VK_EVENT_RESET: return "VK_EVENT_RESET"; + case VK_INCOMPLETE: return "VK_INCOMPLETE"; + case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER"; + default: return "VkResult(" + std::to_string(static_cast(r)) + ")"; + } +} + +static bool check(VkResult r, const char *what, std::string &error) { + if (r == VK_SUCCESS) return true; + error = std::string(what) + " failed: " + vk_result_string(r); + return false; +} + +static bool has_extension(const std::vector &props, const char *name) { + return std::any_of(props.begin(), props.end(), [&](const VkExtensionProperties &p) { + return std::strcmp(p.extensionName, name) == 0; + }); +} + +static void destroy_buffer(buffer_t &b) { + if (b.buffer) vkDestroyBuffer(g.device, b.buffer, nullptr); + if (b.memory) vkFreeMemory(g.device, b.memory, nullptr); + b = {}; +} + +static void destroy_as(as_t &a) { + if (a.as) g.vkDestroyAccelerationStructureKHR_(g.device, a.as, nullptr); + destroy_buffer(a.storage); + a = {}; +} + +static void destroy_locked() { + if (g.device) vkDeviceWaitIdle(g.device); + + if (g.direct_pipeline) vkDestroyPipeline(g.device, g.direct_pipeline, nullptr); + if (g.direct_pipeline_layout) vkDestroyPipelineLayout(g.device, g.direct_pipeline_layout, nullptr); + if (g.direct_descriptor_pool) vkDestroyDescriptorPool(g.device, g.direct_descriptor_pool, 
nullptr); + if (g.direct_descriptor_set_layout) vkDestroyDescriptorSetLayout(g.device, g.direct_descriptor_set_layout, nullptr); + + if (g.pipeline) vkDestroyPipeline(g.device, g.pipeline, nullptr); + if (g.pipeline_layout) vkDestroyPipelineLayout(g.device, g.pipeline_layout, nullptr); + if (g.descriptor_pool) vkDestroyDescriptorPool(g.device, g.descriptor_pool, nullptr); + if (g.descriptor_set_layout) vkDestroyDescriptorSetLayout(g.device, g.descriptor_set_layout, nullptr); + + destroy_as(g.tlas); + destroy_as(g.blas); + destroy_buffer(g.instances); + destroy_buffer(g.indices); + destroy_buffer(g.vertices); + + if (g.command_pool) vkDestroyCommandPool(g.device, g.command_pool, nullptr); + if (g.device) vkDestroyDevice(g.device, nullptr); + if (g.instance) vkDestroyInstance(g.instance, nullptr); + g = {}; +} + +static bool find_memory_type(std::uint32_t type_bits, VkMemoryPropertyFlags props, std::uint32_t &type_index) { + for (std::uint32_t i = 0; i < g.memory_props.memoryTypeCount; ++i) { + if ((type_bits & (1u << i)) && ((g.memory_props.memoryTypes[i].propertyFlags & props) == props)) { + type_index = i; + return true; + } + } + return false; +} + +static bool create_buffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags props, buffer_t &out, + std::string &error, const void *initial_data = nullptr) { + out = {}; + out.size = size; + + VkBufferCreateInfo bi{}; + bi.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bi.size = size; + bi.usage = usage; + bi.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + if (!check(vkCreateBuffer(g.device, &bi, nullptr, &out.buffer), "vkCreateBuffer", error)) return false; + + VkMemoryRequirements req{}; + vkGetBufferMemoryRequirements(g.device, out.buffer, &req); + + std::uint32_t mem_type = 0; + if (!find_memory_type(req.memoryTypeBits, props, mem_type)) { + error = "no compatible Vulkan memory type for buffer"; + destroy_buffer(out); + return false; + } + + VkMemoryAllocateFlagsInfo flags{}; + flags.sType = 
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO; + flags.flags = (usage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT) ? VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT : 0; + + VkMemoryAllocateInfo ai{}; + ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + ai.pNext = flags.flags ? &flags : nullptr; + ai.allocationSize = req.size; + ai.memoryTypeIndex = mem_type; + + if (!check(vkAllocateMemory(g.device, &ai, nullptr, &out.memory), "vkAllocateMemory", error)) { + destroy_buffer(out); + return false; + } + if (!check(vkBindBufferMemory(g.device, out.buffer, out.memory, 0), "vkBindBufferMemory", error)) { + destroy_buffer(out); + return false; + } + + if (initial_data) { + void *mapped = nullptr; + if (!check(vkMapMemory(g.device, out.memory, 0, size, 0, &mapped), "vkMapMemory", error)) { + destroy_buffer(out); + return false; + } + std::memcpy(mapped, initial_data, static_cast(size)); + vkUnmapMemory(g.device, out.memory); + } + + return true; +} + +static VkDeviceAddress buffer_address(const buffer_t &b) { + VkBufferDeviceAddressInfo info{}; + info.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO; + info.buffer = b.buffer; + return g.vkGetBufferDeviceAddressKHR_(g.device, &info); +} + +static bool one_time_submit(const std::function &record, std::string &error) { + if (!check(vkResetCommandBuffer(g.command_buffer, 0), "vkResetCommandBuffer", error)) return false; + + VkCommandBufferBeginInfo bi{}; + bi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + bi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + if (!check(vkBeginCommandBuffer(g.command_buffer, &bi), "vkBeginCommandBuffer", error)) return false; + record(g.command_buffer); + if (!check(vkEndCommandBuffer(g.command_buffer), "vkEndCommandBuffer", error)) return false; + + VkSubmitInfo si{}; + si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + si.commandBufferCount = 1; + si.pCommandBuffers = &g.command_buffer; + if (!check(vkQueueSubmit(g.queue, 1, &si, VK_NULL_HANDLE), "vkQueueSubmit", error)) return false; + if 
(!check(vkQueueWaitIdle(g.queue), "vkQueueWaitIdle", error)) return false; + return true; +} + +static bool create_instance(std::string &error) { + VkApplicationInfo app{}; + app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + app.pApplicationName = "ericw-tools light gpu"; + app.applicationVersion = VK_MAKE_VERSION(0, 2, 0); + app.pEngineName = "ericw-tools"; + app.engineVersion = VK_MAKE_VERSION(0, 2, 0); + app.apiVersion = VK_API_VERSION_1_2; + + VkInstanceCreateInfo ci{}; + ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + ci.pApplicationInfo = &app; + return check(vkCreateInstance(&ci, nullptr, &g.instance), "vkCreateInstance", error); +} + +static bool pick_device(std::string &error) { + std::uint32_t count = 0; + if (!check(vkEnumeratePhysicalDevices(g.instance, &count, nullptr), "vkEnumeratePhysicalDevices(count)", error)) return false; + if (!count) { error = "no Vulkan physical devices found"; return false; } + + std::vector devices(count); + if (!check(vkEnumeratePhysicalDevices(g.instance, &count, devices.data()), "vkEnumeratePhysicalDevices(list)", error)) return false; + + for (VkPhysicalDevice dev : devices) { + std::uint32_t ext_count = 0; + vkEnumerateDeviceExtensionProperties(dev, nullptr, &ext_count, nullptr); + std::vector exts(ext_count); + vkEnumerateDeviceExtensionProperties(dev, nullptr, &ext_count, exts.data()); + + if (!has_extension(exts, VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME) || + !has_extension(exts, VK_KHR_RAY_QUERY_EXTENSION_NAME) || + !has_extension(exts, VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME) || + !has_extension(exts, VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME)) { + continue; + } + + VkPhysicalDeviceBufferDeviceAddressFeatures bda{}; + bda.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES; + VkPhysicalDeviceRayQueryFeaturesKHR rq{}; + rq.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR; + rq.pNext = &bda; + VkPhysicalDeviceAccelerationStructureFeaturesKHR as{}; + as.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR; + as.pNext = &rq; + VkPhysicalDeviceFeatures2 f2{}; + f2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + f2.pNext = &as; + vkGetPhysicalDeviceFeatures2(dev, &f2); + if (!as.accelerationStructure || !rq.rayQuery || !bda.bufferDeviceAddress) continue; + + std::uint32_t q_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties(dev, &q_count, nullptr); + std::vector qs(q_count); + vkGetPhysicalDeviceQueueFamilyProperties(dev, &q_count, qs.data()); + for (std::uint32_t i = 0; i < q_count; ++i) { + if (qs[i].queueFlags & VK_QUEUE_COMPUTE_BIT) { + g.physical = dev; + g.queue_family = i; + vkGetPhysicalDeviceMemoryProperties(dev, &g.memory_props); + return true; + } + } + } + + error = "no Vulkan device with acceleration_structure + ray_query + buffer_device_address + compute queue found"; + return false; +} + +static bool create_device(std::string &error) { + float priority = 1.0f; + VkDeviceQueueCreateInfo qci{}; + qci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + qci.queueFamilyIndex = g.queue_family; + qci.queueCount = 1; + qci.pQueuePriorities = &priority; + + VkPhysicalDeviceBufferDeviceAddressFeatures bda{}; + bda.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES; + bda.bufferDeviceAddress = VK_TRUE; + + VkPhysicalDeviceRayQueryFeaturesKHR rq{}; + rq.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR; + rq.rayQuery = VK_TRUE; + rq.pNext = &bda; + + VkPhysicalDeviceAccelerationStructureFeaturesKHR as{}; + as.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR; + as.accelerationStructure = VK_TRUE; + as.pNext = &rq; + + const char *extensions[] = { + VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME, + VK_KHR_RAY_QUERY_EXTENSION_NAME, + VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME, + VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME, + }; + + VkDeviceCreateInfo dci{}; + dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + dci.pNext 
= &as; + dci.queueCreateInfoCount = 1; + dci.pQueueCreateInfos = &qci; + dci.enabledExtensionCount = static_cast(sizeof(extensions) / sizeof(extensions[0])); + dci.ppEnabledExtensionNames = extensions; + + if (!check(vkCreateDevice(g.physical, &dci, nullptr, &g.device), "vkCreateDevice", error)) return false; + vkGetDeviceQueue(g.device, g.queue_family, 0, &g.queue); + +#define LOAD_DEVICE_PROC(name) \ + g.name##_ = reinterpret_cast(vkGetDeviceProcAddr(g.device, #name)); \ + if (!g.name##_) { error = "missing device proc " #name; return false; } + LOAD_DEVICE_PROC(vkGetBufferDeviceAddressKHR); + LOAD_DEVICE_PROC(vkCreateAccelerationStructureKHR); + LOAD_DEVICE_PROC(vkDestroyAccelerationStructureKHR); + LOAD_DEVICE_PROC(vkGetAccelerationStructureBuildSizesKHR); + LOAD_DEVICE_PROC(vkCmdBuildAccelerationStructuresKHR); + LOAD_DEVICE_PROC(vkGetAccelerationStructureDeviceAddressKHR); +#undef LOAD_DEVICE_PROC + + VkCommandPoolCreateInfo pci{}; + pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + pci.queueFamilyIndex = g.queue_family; + pci.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + if (!check(vkCreateCommandPool(g.device, &pci, nullptr, &g.command_pool), "vkCreateCommandPool", error)) return false; + + VkCommandBufferAllocateInfo cai{}; + cai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + cai.commandPool = g.command_pool; + cai.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + cai.commandBufferCount = 1; + if (!check(vkAllocateCommandBuffers(g.device, &cai, &g.command_buffer), "vkAllocateCommandBuffers", error)) return false; + + return true; +} + +static bool gather_geometry(const mbsp_t *bsp, std::vector &vertices, std::vector &indices, std::string &error) { + vertices.clear(); + indices.clear(); + + const auto &faces = ShadowCastingSolidFacesSet(); + if (faces.empty()) { + error = "no shadow-casting solid faces found for GPU BLAS; call Embree_TraceInit before GPU_TraceInit"; + return false; + } + + for (const mface_t *face : faces) { + if 
(!face || face->numedges < 3) continue; + const modelinfo_t *modelinfo = ModelInfoForFace(bsp, Face_GetNum(bsp, face)); + if (!modelinfo) continue; + + for (int j = 2; j < face->numedges; ++j) { + const int v0 = Face_VertexAtIndex(bsp, face, j - 1); + const int v1 = Face_VertexAtIndex(bsp, face, j); + const int v2 = Face_VertexAtIndex(bsp, face, 0); + const qvec3f p0 = Vertex_GetPos(bsp, v0) + modelinfo->offset; + const qvec3f p1 = Vertex_GetPos(bsp, v1) + modelinfo->offset; + const qvec3f p2 = Vertex_GetPos(bsp, v2) + modelinfo->offset; + + const std::uint32_t base = static_cast(vertices.size()); + vertices.push_back({p0[0], p0[1], p0[2]}); + vertices.push_back({p1[0], p1[1], p1[2]}); + vertices.push_back({p2[0], p2[1], p2[2]}); + indices.push_back(base + 0); + indices.push_back(base + 1); + indices.push_back(base + 2); + } + } + + if (indices.empty()) { + error = "GPU geometry gather produced zero triangles"; + return false; + } + return true; +} + +static bool create_acceleration_structure(VkAccelerationStructureTypeKHR type, VkDeviceSize size, as_t &out, std::string &error) { + if (!create_buffer(size, + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + out.storage, + error)) return false; + + VkAccelerationStructureCreateInfoKHR ci{}; + ci.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR; + ci.type = type; + ci.size = size; + ci.buffer = out.storage.buffer; + if (!check(g.vkCreateAccelerationStructureKHR_(g.device, &ci, nullptr, &out.as), "vkCreateAccelerationStructureKHR", error)) return false; + + VkAccelerationStructureDeviceAddressInfoKHR ai{}; + ai.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_DEVICE_ADDRESS_INFO_KHR; + ai.accelerationStructure = out.as; + out.address = g.vkGetAccelerationStructureDeviceAddressKHR_(g.device, &ai); + return true; +} + +static bool build_blas(const std::vector &vertices, const std::vector &indices, std::string 
&error) { + if (!create_buffer(sizeof(vertex_t) * vertices.size(), + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + g.vertices, + error, + vertices.data())) return false; + + if (!create_buffer(sizeof(std::uint32_t) * indices.size(), + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + g.indices, + error, + indices.data())) return false; + + VkDeviceAddress vertex_addr = buffer_address(g.vertices); + VkDeviceAddress index_addr = buffer_address(g.indices); + + VkAccelerationStructureGeometryKHR geom{}; + geom.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR; + geom.geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR; + geom.flags = VK_GEOMETRY_OPAQUE_BIT_KHR; + geom.geometry.triangles.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_TRIANGLES_DATA_KHR; + geom.geometry.triangles.vertexFormat = VK_FORMAT_R32G32B32_SFLOAT; + geom.geometry.triangles.vertexData.deviceAddress = vertex_addr; + geom.geometry.triangles.vertexStride = sizeof(vertex_t); + geom.geometry.triangles.maxVertex = static_cast(vertices.size() - 1); + geom.geometry.triangles.indexType = VK_INDEX_TYPE_UINT32; + geom.geometry.triangles.indexData.deviceAddress = index_addr; + + const std::uint32_t prim_count = static_cast(indices.size() / 3); + g.triangle_count = prim_count; + + VkAccelerationStructureBuildGeometryInfoKHR build{}; + build.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR; + build.type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR; + build.flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR; + build.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR; + build.geometryCount = 1; + build.pGeometries = &geom; + + 
VkAccelerationStructureBuildSizesInfoKHR sizes{}; + sizes.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR; + g.vkGetAccelerationStructureBuildSizesKHR_(g.device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &build, &prim_count, &sizes); + + if (!create_acceleration_structure(VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR, sizes.accelerationStructureSize, g.blas, error)) return false; + + buffer_t scratch; + if (!create_buffer(sizes.buildScratchSize, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + scratch, + error)) return false; + + build.dstAccelerationStructure = g.blas.as; + build.scratchData.deviceAddress = buffer_address(scratch); + + VkAccelerationStructureBuildRangeInfoKHR range{}; + range.primitiveCount = prim_count; + const VkAccelerationStructureBuildRangeInfoKHR *range_ptr = ⦥ + + bool ok = one_time_submit([&](VkCommandBuffer cmd) { + g.vkCmdBuildAccelerationStructuresKHR_(cmd, 1, &build, &range_ptr); + }, error); + + destroy_buffer(scratch); + return ok; +} + +static bool build_tlas(std::string &error) { + VkAccelerationStructureInstanceKHR inst{}; + inst.transform.matrix[0][0] = 1.0f; + inst.transform.matrix[1][1] = 1.0f; + inst.transform.matrix[2][2] = 1.0f; + inst.instanceCustomIndex = 0; + inst.mask = 0xff; + inst.instanceShaderBindingTableRecordOffset = 0; + inst.flags = VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR; + inst.accelerationStructureReference = g.blas.address; + + if (!create_buffer(sizeof(inst), + VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + g.instances, + error, + &inst)) return false; + + VkAccelerationStructureGeometryKHR geom{}; + geom.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR; + geom.geometryType = VK_GEOMETRY_TYPE_INSTANCES_KHR; + geom.flags = 
VK_GEOMETRY_OPAQUE_BIT_KHR; + geom.geometry.instances.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_INSTANCES_DATA_KHR; + geom.geometry.instances.arrayOfPointers = VK_FALSE; + geom.geometry.instances.data.deviceAddress = buffer_address(g.instances); + + const std::uint32_t prim_count = 1; + + VkAccelerationStructureBuildGeometryInfoKHR build{}; + build.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR; + build.type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR; + build.flags = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR; + build.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR; + build.geometryCount = 1; + build.pGeometries = &geom; + + VkAccelerationStructureBuildSizesInfoKHR sizes{}; + sizes.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR; + g.vkGetAccelerationStructureBuildSizesKHR_(g.device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &build, &prim_count, &sizes); + + if (!create_acceleration_structure(VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR, sizes.accelerationStructureSize, g.tlas, error)) return false; + + buffer_t scratch; + if (!create_buffer(sizes.buildScratchSize, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + scratch, + error)) return false; + + build.dstAccelerationStructure = g.tlas.as; + build.scratchData.deviceAddress = buffer_address(scratch); + + VkAccelerationStructureBuildRangeInfoKHR range{}; + range.primitiveCount = prim_count; + const VkAccelerationStructureBuildRangeInfoKHR *range_ptr = ⦥ + + bool ok = one_time_submit([&](VkCommandBuffer cmd) { + g.vkCmdBuildAccelerationStructuresKHR_(cmd, 1, &build, &range_ptr); + }, error); + + destroy_buffer(scratch); + return ok; +} + +static std::filesystem::path exe_dir() { +#if defined(__linux__) + std::array buf{}; + ssize_t len = readlink("/proc/self/exe", buf.data(), buf.size() - 1); + if (len > 0) { + buf[static_cast(len)] = '\0'; + 
/*
 * Loads a SPIR-V binary from `path` into `words` (one element per 32-bit word).
 *
 * Returns true on success. Returns false and describes the problem in `error`
 * when the file cannot be opened, is empty, has a size that is not a multiple
 * of four bytes, or cannot be read completely. `words` is left empty on every
 * failure so callers never see partially read data.
 */
static bool read_file(const std::filesystem::path &path, std::vector<std::uint32_t> &words, std::string &error) {
    words.clear();

    // Open positioned at the end so tellg() yields the file size.
    std::ifstream f(path, std::ios::binary | std::ios::ate);
    if (!f) {
        error = "could not open shader: " + path.string();
        return false;
    }

    const std::streamsize size = f.tellg();
    if (size <= 0 || (size % 4) != 0) {
        error = "shader has invalid SPIR-V size: " + path.string();
        return false;
    }

    f.seekg(0, std::ios::beg);
    words.resize(static_cast<std::size_t>(size / 4));
    if (!f.read(reinterpret_cast<char *>(words.data()), size)) {
        words.clear();
        error = "failed to read shader: " + path.string();
        return false;
    }
    return true;
}
plci.pSetLayouts = &g.descriptor_set_layout; + plci.pushConstantRangeCount = 1; + plci.pPushConstantRanges = &pcr; + if (!check(vkCreatePipelineLayout(g.device, &plci, nullptr, &g.pipeline_layout), "vkCreatePipelineLayout", error)) return false; + + std::vector spv; + const auto shader_path = exe_dir() / "gpu_shaders" / "occlusion.comp.spv"; + if (!read_file(shader_path, spv, error)) return false; + + VkShaderModuleCreateInfo smci{}; + smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + smci.codeSize = spv.size() * sizeof(std::uint32_t); + smci.pCode = spv.data(); + VkShaderModule shader = VK_NULL_HANDLE; + if (!check(vkCreateShaderModule(g.device, &smci, nullptr, &shader), "vkCreateShaderModule", error)) return false; + + VkComputePipelineCreateInfo cpci{}; + cpci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + cpci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + cpci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + cpci.stage.module = shader; + cpci.stage.pName = "main"; + cpci.layout = g.pipeline_layout; + bool ok = check(vkCreateComputePipelines(g.device, VK_NULL_HANDLE, 1, &cpci, nullptr, &g.pipeline), "vkCreateComputePipelines", error); + vkDestroyShaderModule(g.device, shader, nullptr); + if (!ok) return false; + + VkDescriptorPoolSize ps0{}; + ps0.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + ps0.descriptorCount = 1; + VkDescriptorPoolSize ps1{}; + ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + ps1.descriptorCount = 2; + std::array sizes{ps0, ps1}; + + VkDescriptorPoolCreateInfo dpci{}; + dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + dpci.maxSets = 1; + dpci.poolSizeCount = static_cast(sizes.size()); + dpci.pPoolSizes = sizes.data(); + if (!check(vkCreateDescriptorPool(g.device, &dpci, nullptr, &g.descriptor_pool), "vkCreateDescriptorPool", error)) return false; + + VkDescriptorSetAllocateInfo dsai{}; + dsai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + dsai.descriptorPool = 
g.descriptor_pool; + dsai.descriptorSetCount = 1; + dsai.pSetLayouts = &g.descriptor_set_layout; + if (!check(vkAllocateDescriptorSets(g.device, &dsai, &g.descriptor_set), "vkAllocateDescriptorSets", error)) return false; + + return true; +} + +static bool create_direct_pipeline(std::string &error) { + VkDescriptorSetLayoutBinding b0{}; + b0.binding = 0; + b0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + b0.descriptorCount = 1; + b0.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b1{}; + b1.binding = 1; + b1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b1.descriptorCount = 1; + b1.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b2{}; + b2.binding = 2; + b2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b2.descriptorCount = 1; + b2.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b3{}; + b3.binding = 3; + b3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b3.descriptorCount = 1; + b3.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b4{}; + b4.binding = 4; + b4.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b4.descriptorCount = 1; + b4.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + VkDescriptorSetLayoutBinding b5{}; + b5.binding = 5; + b5.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + b5.descriptorCount = 1; + b5.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + + std::array bindings{b0, b1, b2, b3, b4, b5}; + VkDescriptorSetLayoutCreateInfo dlci{}; + dlci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + dlci.bindingCount = static_cast(bindings.size()); + dlci.pBindings = bindings.data(); + if (!check(vkCreateDescriptorSetLayout(g.device, &dlci, nullptr, &g.direct_descriptor_set_layout), "vkCreateDescriptorSetLayout(direct)", error)) return false; + + VkPushConstantRange pcr{}; + pcr.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + pcr.offset = 0; + pcr.size = sizeof(direct_push_constants_t); + + 
VkPipelineLayoutCreateInfo plci{}; + plci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + plci.setLayoutCount = 1; + plci.pSetLayouts = &g.direct_descriptor_set_layout; + plci.pushConstantRangeCount = 1; + plci.pPushConstantRanges = &pcr; + if (!check(vkCreatePipelineLayout(g.device, &plci, nullptr, &g.direct_pipeline_layout), "vkCreatePipelineLayout(direct)", error)) return false; + + std::vector spv; + const auto shader_path = exe_dir() / "gpu_shaders" / "direct_phase.comp.spv"; + if (!read_file(shader_path, spv, error)) return false; + + VkShaderModuleCreateInfo smci{}; + smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + smci.codeSize = spv.size() * sizeof(std::uint32_t); + smci.pCode = spv.data(); + VkShaderModule shader = VK_NULL_HANDLE; + if (!check(vkCreateShaderModule(g.device, &smci, nullptr, &shader), "vkCreateShaderModule(direct)", error)) return false; + + VkComputePipelineCreateInfo cpci{}; + cpci.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + cpci.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + cpci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; + cpci.stage.module = shader; + cpci.stage.pName = "main"; + cpci.layout = g.direct_pipeline_layout; + bool ok = check(vkCreateComputePipelines(g.device, VK_NULL_HANDLE, 1, &cpci, nullptr, &g.direct_pipeline), "vkCreateComputePipelines(direct)", error); + vkDestroyShaderModule(g.device, shader, nullptr); + if (!ok) return false; + + VkDescriptorPoolSize ps0{}; + ps0.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + ps0.descriptorCount = 1; + VkDescriptorPoolSize ps1{}; + ps1.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + ps1.descriptorCount = 5; + std::array sizes{ps0, ps1}; + + VkDescriptorPoolCreateInfo dpci{}; + dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + dpci.maxSets = 1; + dpci.poolSizeCount = static_cast(sizes.size()); + dpci.pPoolSizes = sizes.data(); + if (!check(vkCreateDescriptorPool(g.device, &dpci, nullptr, 
&g.direct_descriptor_pool), "vkCreateDescriptorPool(direct)", error)) return false; + + VkDescriptorSetAllocateInfo dsai{}; + dsai.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + dsai.descriptorPool = g.direct_descriptor_pool; + dsai.descriptorSetCount = 1; + dsai.pSetLayouts = &g.direct_descriptor_set_layout; + if (!check(vkAllocateDescriptorSets(g.device, &dsai, &g.direct_descriptor_set), "vkAllocateDescriptorSets(direct)", error)) return false; + + return true; +} + +static void update_direct_descriptor_set( + const buffer_t &sample_buffer, + const buffer_t &source_buffer, + const buffer_t &face_range_buffer, + const buffer_t &face_source_index_buffer, + const buffer_t &accum_buffer) { + VkWriteDescriptorSetAccelerationStructureKHR as_info{}; + as_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR; + as_info.accelerationStructureCount = 1; + as_info.pAccelerationStructures = &g.tlas.as; + + VkWriteDescriptorSet w0{}; + w0.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w0.pNext = &as_info; + w0.dstSet = g.direct_descriptor_set; + w0.dstBinding = 0; + w0.descriptorCount = 1; + w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + + VkDescriptorBufferInfo sample_info{}; + sample_info.buffer = sample_buffer.buffer; + sample_info.offset = 0; + sample_info.range = sample_buffer.size; + VkWriteDescriptorSet w1{}; + w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w1.dstSet = g.direct_descriptor_set; + w1.dstBinding = 1; + w1.descriptorCount = 1; + w1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w1.pBufferInfo = &sample_info; + + VkDescriptorBufferInfo source_info{}; + source_info.buffer = source_buffer.buffer; + source_info.offset = 0; + source_info.range = source_buffer.size; + VkWriteDescriptorSet w2{}; + w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w2.dstSet = g.direct_descriptor_set; + w2.dstBinding = 2; + w2.descriptorCount = 1; + w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + 
w2.pBufferInfo = &source_info; + + VkDescriptorBufferInfo face_range_info{}; + face_range_info.buffer = face_range_buffer.buffer; + face_range_info.offset = 0; + face_range_info.range = face_range_buffer.size; + VkWriteDescriptorSet w3{}; + w3.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w3.dstSet = g.direct_descriptor_set; + w3.dstBinding = 3; + w3.descriptorCount = 1; + w3.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w3.pBufferInfo = &face_range_info; + + VkDescriptorBufferInfo face_source_index_info{}; + face_source_index_info.buffer = face_source_index_buffer.buffer; + face_source_index_info.offset = 0; + face_source_index_info.range = face_source_index_buffer.size; + VkWriteDescriptorSet w4{}; + w4.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w4.dstSet = g.direct_descriptor_set; + w4.dstBinding = 4; + w4.descriptorCount = 1; + w4.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w4.pBufferInfo = &face_source_index_info; + + VkDescriptorBufferInfo accum_info{}; + accum_info.buffer = accum_buffer.buffer; + accum_info.offset = 0; + accum_info.range = accum_buffer.size; + VkWriteDescriptorSet w5{}; + w5.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w5.dstSet = g.direct_descriptor_set; + w5.dstBinding = 5; + w5.descriptorCount = 1; + w5.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w5.pBufferInfo = &accum_info; + + std::array writes{w0, w1, w2, w3, w4, w5}; + vkUpdateDescriptorSets(g.device, static_cast(writes.size()), writes.data(), 0, nullptr); +} + +static void update_descriptor_set(const buffer_t &ray_buffer, const buffer_t &result_buffer) { + VkWriteDescriptorSetAccelerationStructureKHR as_info{}; + as_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR; + as_info.accelerationStructureCount = 1; + as_info.pAccelerationStructures = &g.tlas.as; + + VkWriteDescriptorSet w0{}; + w0.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w0.pNext = &as_info; + w0.dstSet = g.descriptor_set; + w0.dstBinding = 0; 
+ w0.descriptorCount = 1; + w0.descriptorType = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; + + VkDescriptorBufferInfo ray_info{}; + ray_info.buffer = ray_buffer.buffer; + ray_info.offset = 0; + ray_info.range = ray_buffer.size; + VkWriteDescriptorSet w1{}; + w1.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w1.dstSet = g.descriptor_set; + w1.dstBinding = 1; + w1.descriptorCount = 1; + w1.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w1.pBufferInfo = &ray_info; + + VkDescriptorBufferInfo result_info{}; + result_info.buffer = result_buffer.buffer; + result_info.offset = 0; + result_info.range = result_buffer.size; + VkWriteDescriptorSet w2{}; + w2.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + w2.dstSet = g.descriptor_set; + w2.dstBinding = 2; + w2.descriptorCount = 1; + w2.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + w2.pBufferInfo = &result_info; + + std::array writes{w0, w1, w2}; + vkUpdateDescriptorSets(g.device, static_cast(writes.size()), writes.data(), 0, nullptr); +} + +} // namespace + +bool init(const mbsp_t *bsp, std::string &error) { + std::lock_guard lock(g_mutex); + destroy_locked(); + + if (!create_instance(error)) { destroy_locked(); return false; } + if (!pick_device(error)) { destroy_locked(); return false; } + if (!create_device(error)) { destroy_locked(); return false; } + + g.has_filtered_embree_geometry = !filtergeom.triInfo.empty(); + if (g.has_filtered_embree_geometry) { + logging::print("GPU light: filtered Embree geometry exists ({} tris); GPU will fall back for correctness.\n", filtergeom.triInfo.size()); + } + + std::vector vertices; + std::vector indices; + if (!gather_geometry(bsp, vertices, indices, error)) { destroy_locked(); return false; } + if (!build_blas(vertices, indices, error)) { destroy_locked(); return false; } + if (!build_tlas(error)) { destroy_locked(); return false; } + if (!create_pipeline(error)) { destroy_locked(); return false; } + if (!create_direct_pipeline(error)) { destroy_locked(); 
return false; } + + logging::print("GPU light: Vulkan ray-query BLAS/TLAS ready ({} opaque triangles).\n", g.triangle_count); + return true; +} + +void shutdown() { + std::lock_guard lock(g_mutex); + destroy_locked(); +} + +bool trace_occlusion_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::ray_t *rays, + gpu_light::occlusion_result_t *results, + std::size_t count, + std::string &error) { + std::lock_guard lock(g_mutex); + if (!g.device || !g.pipeline || !g.tlas.as) { + error = "Vulkan GPU backend is not initialized"; + return false; + } + + // Correctness guard: the GPU fast path only contains opaque solid/default geometry. + // If Embree has filtered geometry, let CPU handle batches so glass/fence/dynamic/channel filters remain correct. + if (g.has_filtered_embree_geometry) { + return false; + } + if (shadow_mask != CHANNEL_MASK_DEFAULT) { + return false; + } + (void)self; + + std::vector gpu_rays(count); + for (std::size_t i = 0; i < count; ++i) { + gpu_rays[i].ox = rays[i].origin[0]; + gpu_rays[i].oy = rays[i].origin[1]; + gpu_rays[i].oz = rays[i].origin[2]; + gpu_rays[i].tmin = rays[i].tmin; + gpu_rays[i].dx = rays[i].direction[0]; + gpu_rays[i].dy = rays[i].direction[1]; + gpu_rays[i].dz = rays[i].direction[2]; + gpu_rays[i].tmax = rays[i].tmax; + gpu_rays[i].shadow_mask = rays[i].shadow_mask; + gpu_rays[i].user_index = rays[i].user_index; + } + + buffer_t ray_buffer; + buffer_t result_buffer; + std::vector zero_results(count); + + bool ok = create_buffer(sizeof(gpu_ray_host_t) * count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + ray_buffer, + error, + gpu_rays.data()); + if (!ok) return false; + + ok = create_buffer(sizeof(gpu_result_host_t) * count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + result_buffer, + error, + zero_results.data()); + if (!ok) { + 
destroy_buffer(ray_buffer); + return false; + } + + update_descriptor_set(ray_buffer, result_buffer); + + push_constants_t pc{}; + pc.ray_count = static_cast(count); + pc.flags = 0; + + ok = one_time_submit([&](VkCommandBuffer cmd) { + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.pipeline); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.pipeline_layout, 0, 1, &g.descriptor_set, 0, nullptr); + vkCmdPushConstants(cmd, g.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + vkCmdDispatch(cmd, (pc.ray_count + 127u) / 128u, 1, 1); + }, error); + + if (ok) { + void *mapped = nullptr; + ok = check(vkMapMemory(g.device, result_buffer.memory, 0, result_buffer.size, 0, &mapped), "vkMapMemory(result)", error); + if (ok) { + const auto *gpu_results = static_cast(mapped); + for (std::size_t i = 0; i < count; ++i) { + results[i].occluded = gpu_results[i].occluded; + results[i].reserved0 = gpu_results[i].reserved0; + results[i].transmittance[0] = gpu_results[i].tr; + results[i].transmittance[1] = gpu_results[i].tg; + results[i].transmittance[2] = gpu_results[i].tb; + } + vkUnmapMemory(g.device, result_buffer.memory); + } + } + + destroy_buffer(result_buffer); + destroy_buffer(ray_buffer); + return ok; +} + + +bool trace_direct_phase_batch( + const gpu_light::direct_phase_source_t *sources, + std::size_t source_count, + const gpu_light::direct_phase_sample_t *samples, + gpu_light::direct_phase_accum_t *accum, + std::size_t sample_count, + const gpu_light::direct_phase_face_range_t *face_ranges, + std::size_t face_range_count, + const std::uint32_t *face_source_indices, + std::size_t face_source_index_count, + std::string &error) { + std::lock_guard lock(g_mutex); + if (!g.device || !g.direct_pipeline || !g.tlas.as) { + error = "Vulkan GPU direct phase backend is not initialized"; + return false; + } + if (g.has_filtered_embree_geometry) { + return false; + } + if (!sources || !samples || !accum || !face_ranges || !face_source_indices || 
source_count == 0 || sample_count == 0 || face_range_count == 0 || face_source_index_count == 0) { + return true; + } + + std::vector gpu_samples(sample_count); + for (std::size_t i = 0; i < sample_count; ++i) { + gpu_samples[i].px = samples[i].px; + gpu_samples[i].py = samples[i].py; + gpu_samples[i].pz = samples[i].pz; + gpu_samples[i].occlusion = samples[i].occlusion; + gpu_samples[i].nx = samples[i].nx; + gpu_samples[i].ny = samples[i].ny; + gpu_samples[i].nz = samples[i].nz; + gpu_samples[i].twosided = samples[i].twosided; + gpu_samples[i].face_index = samples[i].face_index; + gpu_samples[i].reserved0 = 0; + gpu_samples[i].reserved1 = 0; + gpu_samples[i].reserved2 = 0; + } + + std::vector gpu_sources(source_count); + for (std::size_t i = 0; i < source_count; ++i) { + gpu_sources[i].px = sources[i].px; + gpu_sources[i].py = sources[i].py; + gpu_sources[i].pz = sources[i].pz; + gpu_sources[i].light = sources[i].light; + gpu_sources[i].dx = sources[i].dx; + gpu_sources[i].dy = sources[i].dy; + gpu_sources[i].dz = sources[i].dz; + gpu_sources[i].dist = sources[i].dist; + gpu_sources[i].cr = sources[i].cr; + gpu_sources[i].cg = sources[i].cg; + gpu_sources[i].cb = sources[i].cb; + gpu_sources[i].atten = sources[i].atten; + gpu_sources[i].type = sources[i].type; + gpu_sources[i].formula = sources[i].formula; + gpu_sources[i].flags = sources[i].flags; + gpu_sources[i].reserved0 = 0; + gpu_sources[i].anglescale = sources[i].anglescale; + gpu_sources[i].dirt = sources[i].dirt; + gpu_sources[i].falloff = sources[i].falloff; + gpu_sources[i].pad0 = 0.0f; + } + + std::vector gpu_face_ranges(face_range_count); + for (std::size_t i = 0; i < face_range_count; ++i) { + gpu_face_ranges[i].source_begin = face_ranges[i].source_begin; + gpu_face_ranges[i].source_count = face_ranges[i].source_count; + } + + std::vector gpu_face_source_indices(face_source_index_count); + std::memcpy(gpu_face_source_indices.data(), face_source_indices, sizeof(std::uint32_t) * 
face_source_index_count); + + std::vector zero_accum(sample_count); + + buffer_t sample_buffer; + buffer_t source_buffer; + buffer_t face_range_buffer; + buffer_t face_source_index_buffer; + buffer_t accum_buffer; + + bool ok = create_buffer(sizeof(gpu_direct_phase_sample_host_t) * sample_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + sample_buffer, + error, + gpu_samples.data()); + if (!ok) return false; + + ok = create_buffer(sizeof(gpu_direct_phase_source_host_t) * source_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + source_buffer, + error, + gpu_sources.data()); + if (!ok) { + destroy_buffer(sample_buffer); + return false; + } + + ok = create_buffer(sizeof(gpu_direct_phase_face_range_host_t) * face_range_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + face_range_buffer, + error, + gpu_face_ranges.data()); + if (!ok) { + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return false; + } + + ok = create_buffer(sizeof(std::uint32_t) * face_source_index_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + face_source_index_buffer, + error, + gpu_face_source_indices.data()); + if (!ok) { + destroy_buffer(face_range_buffer); + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return false; + } + + ok = create_buffer(sizeof(gpu_direct_accum_host_t) * sample_count, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + accum_buffer, + error, + zero_accum.data()); + if (!ok) { + destroy_buffer(face_source_index_buffer); + destroy_buffer(face_range_buffer); + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return false; + } + + 
update_direct_descriptor_set(sample_buffer, source_buffer, face_range_buffer, face_source_index_buffer, accum_buffer); + + direct_push_constants_t pc{}; + pc.sample_count = static_cast(sample_count); + pc.source_count = static_cast(source_count); + pc.flags = 0; + pc.reserved0 = 0; + + ok = one_time_submit([&](VkCommandBuffer cmd) { + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.direct_pipeline); + vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, g.direct_pipeline_layout, 0, 1, &g.direct_descriptor_set, 0, nullptr); + vkCmdPushConstants(cmd, g.direct_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + vkCmdDispatch(cmd, (pc.sample_count + 63u) / 64u, 1, 1); + }, error); + + if (ok) { + void *mapped = nullptr; + ok = check(vkMapMemory(g.device, accum_buffer.memory, 0, accum_buffer.size, 0, &mapped), "vkMapMemory(direct phase accum)", error); + if (ok) { + const auto *gpu_accum = static_cast(mapped); + for (std::size_t i = 0; i < sample_count; ++i) { + accum[i].cr = gpu_accum[i].cr; + accum[i].cg = gpu_accum[i].cg; + accum[i].cb = gpu_accum[i].cb; + accum[i].pad0 = 0.0f; + accum[i].nr = gpu_accum[i].nr; + accum[i].ng = gpu_accum[i].ng; + accum[i].nb = gpu_accum[i].nb; + accum[i].pad1 = 0.0f; + accum[i].hit = gpu_accum[i].hit; + accum[i].reserved0 = 0; + accum[i].reserved1 = 0; + accum[i].reserved2 = 0; + } + vkUnmapMemory(g.device, accum_buffer.memory); + } + } + + destroy_buffer(accum_buffer); + destroy_buffer(face_source_index_buffer); + destroy_buffer(face_range_buffer); + destroy_buffer(source_buffer); + destroy_buffer(sample_buffer); + return ok; +} + +bool trace_direct_accumulate_batch( + const modelinfo_t *self, + std::uint32_t shadow_mask, + const gpu_light::direct_job_t *jobs, + std::size_t job_count, + const gpu_light::direct_sample_range_t *ranges, + gpu_light::direct_accum_t *accum, + std::size_t sample_count, + std::string &error) { + (void)self; + (void)shadow_mask; + (void)jobs; + (void)job_count; + 
(void)ranges; + (void)accum; + (void)sample_count; + error = "old direct job buffer path disabled in v5; use trace_direct_phase_batch"; + return false; +} + +} // namespace gpu_light::vulkan_backend + +#endif // HAVE_GPU_LIGHT