From b79a618f41093c080e395fc7fa7b60b41bd374da Mon Sep 17 00:00:00 2001
From: MichaelFisher1997 <contact@michaelfisher.tech>
Date: Thu, 2 Apr 2026 22:19:37 +0100
Subject: [PATCH 1/5] feat: integrate GPU frustum culling into WorldRenderer
 (#379)

Replace the CPU-side frustum culling loop in WorldRenderer.render() with a
GPU compute shader dispatch via CullingSystem. The culling shader now writes
visible chunk indices instead of placeholder DrawIndirectCommands. The CPU
reads back the visible indices and builds proper MDI draw commands with real
mesh data.

- Updated culling.comp to write compact visible index list
- CullingSystem uses host-visible index buffers for direct readback
- WorldRenderer.render() uses GPU culling with CPU fallback
- Shadow pass retains CPU culling (per-chunk model matrix needed)
- Fixes import paths in culling_system.zig for proper module resolution
- Passes RHI to WorldRenderer.init() for CullingSystem creation
- Runtime detection: falls back to CPU if GPU culling init fails
---
 assets/shaders/vulkan/culling.comp            |  22 +--
 src/engine/graphics/vulkan/culling_system.zig |  46 +++---
 src/world/world.zig                           |   2 +-
 src/world/world_renderer.zig                  | 134 ++++++++++++++----
 4 files changed, 143 insertions(+), 61 deletions(-)

diff --git a/assets/shaders/vulkan/culling.comp b/assets/shaders/vulkan/culling.comp
index 8865da64..ad72efe2 100644
--- a/assets/shaders/vulkan/culling.comp
+++ b/assets/shaders/vulkan/culling.comp
@@ -2,13 +2,6 @@
 
 layout(local_size_x = 64) in;
 
-struct DrawIndirectCommand {
-    uint vertexCount;
-    uint instanceCount;
-    uint firstVertex;
-    uint firstInstance;
-};
-
 struct ChunkAABB {
     vec4 min_point;
     vec4 max_point;
@@ -18,13 +11,13 @@ layout(std430, binding = 0) readonly buffer ChunkAABBs {
     ChunkAABB chunks[];
 } aabb_buffer;
 
-layout(std430, binding = 1) coherent writeonly buffer DrawCommands {
-    uint visible_count;
+layout(std430, binding = 1) writeonly buffer VisibleIndices {
+    uint count;
     uint _pad0;
     uint _pad1;
     uint _pad2;
-    DrawIndirectCommand commands[];
-} cmd_buffer;
+    uint indices[];
+} visible_buffer;
 
 layout(std430, binding = 2) coherent buffer VisibleCountBuffer {
     uint count;
@@ -68,11 +61,8 @@ void main() {
     if (aabbVisible(aabb_min, aabb_max)) {
         uint slot = atomicAdd(visible_counter.count, 1);
 
-        if (slot < cmd_buffer.commands.length()) {
-            cmd_buffer.commands[slot].vertexCount = 0;
-            cmd_buffer.commands[slot].instanceCount = 1;
-            cmd_buffer.commands[slot].firstVertex = 0;
-            cmd_buffer.commands[slot].firstInstance = idx;
+        if (slot < visible_buffer.indices.length()) {
+            visible_buffer.indices[slot] = idx;
         }
     }
 }
diff --git a/src/engine/graphics/vulkan/culling_system.zig b/src/engine/graphics/vulkan/culling_system.zig
index 0b2a7393..e5d30989 100644
--- a/src/engine/graphics/vulkan/culling_system.zig
+++ b/src/engine/graphics/vulkan/culling_system.zig
@@ -1,8 +1,8 @@
 const std = @import("std");
-const c = @import("../../c.zig").c;
+const c = @import("../../../c.zig").c;
 const rhi_pkg = @import("../rhi.zig");
-const log = @import("../core/log.zig");
-const Mat4 = @import("../math/mat4.zig").Mat4;
+const log = @import("../../core/log.zig");
+const Mat4 = @import("../../math/mat4.zig").Mat4;
 const VulkanContext = @import("rhi_context_types.zig").VulkanContext;
 const Utils = @import("utils.zig");
 
@@ -26,7 +26,7 @@ pub const CullingSystem = struct {
     vk_ctx: *VulkanContext,
 
     aabb_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer,
-    command_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer,
+    visible_index_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer,
     counter_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer,
     counter_readback_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer,
 
@@ -59,7 +59,7 @@ pub const CullingSystem = struct {
             .vk_ctx = vk_ctx,
             .max_chunks = clamped_max,
             .aabb_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer),
-            .command_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer),
+            .visible_index_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer),
             .counter_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer),
             .counter_readback_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer),
             .descriptor_sets = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]c.VkDescriptorSet),
@@ -68,7 +68,7 @@ pub const CullingSystem = struct {
 
         errdefer self.destroyAllBuffers();
         const aabb_size = clamped_max * @sizeOf(ChunkCullData);
-        const cmd_size = @sizeOf(u32) * 4 + clamped_max * @sizeOf(c.VkDrawIndirectCommand);
+        const index_buffer_size = @sizeOf(u32) * 4 + clamped_max * @sizeOf(u32);
 
         for (0..MAX_FRAMES_IN_FLIGHT) |i| {
             self.aabb_buffers[i] = try Utils.createVulkanBuffer(
@@ -78,11 +78,11 @@ pub const CullingSystem = struct {
                 c.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | c.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
             );
 
-            self.command_buffers[i] = try Utils.createVulkanBuffer(
+            self.visible_index_buffers[i] = try Utils.createVulkanBuffer(
                 &vk_ctx.vulkan_device,
-                cmd_size,
-                c.VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | c.VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT,
-                c.VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                index_buffer_size,
+                c.VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+                c.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | c.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
             );
 
             self.counter_buffers[i] = try Utils.createVulkanBuffer(
@@ -189,11 +189,11 @@ pub const CullingSystem = struct {
         var compute_barrier = std.mem.zeroes(c.VkMemoryBarrier);
         compute_barrier.sType = c.VK_STRUCTURE_TYPE_MEMORY_BARRIER;
         compute_barrier.srcAccessMask = c.VK_ACCESS_SHADER_WRITE_BIT;
-        compute_barrier.dstAccessMask = c.VK_ACCESS_INDIRECT_COMMAND_READ_BIT | c.VK_ACCESS_TRANSFER_READ_BIT;
+        compute_barrier.dstAccessMask = c.VK_ACCESS_HOST_READ_BIT | c.VK_ACCESS_TRANSFER_READ_BIT;
         c.vkCmdPipelineBarrier(
             cmd,
             c.VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-            c.VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | c.VK_PIPELINE_STAGE_TRANSFER_BIT,
+            c.VK_PIPELINE_STAGE_HOST_BIT | c.VK_PIPELINE_STAGE_TRANSFER_BIT,
             0,
             1,
             &compute_barrier,
@@ -213,6 +213,16 @@ pub const CullingSystem = struct {
         return ptr.*;
     }
 
+    pub fn readVisibleIndices(self: *CullingSystem, frame_index: usize, count: u32, out: []u32) void {
+        if (count == 0) return;
+        const buf = &self.visible_index_buffers[frame_index];
+        if (buf.mapped_ptr == null) return;
+        const copy_count = @min(@as(usize, @intCast(count)), @min(out.len, self.max_chunks));
+        if (copy_count == 0) return;
+        const src: [*]const u32 = @ptrCast(@alignCast(buf.mapped_ptr.?));
+        @memcpy(out[0..copy_count], src[2 .. 2 + copy_count]);
+    }
+
     fn copyCounterToReadback(self: *CullingSystem, cmd: c.VkCommandBuffer, frame_index: usize) void {
         const src = self.counter_buffers[frame_index];
         const dst = self.counter_readback_buffers[frame_index];
@@ -336,7 +346,7 @@ pub const CullingSystem = struct {
 
         var writes: [3 * MAX_FRAMES_IN_FLIGHT]c.VkWriteDescriptorSet = undefined;
         var aabb_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined;
-        var cmd_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined;
+        var index_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined;
         var counter_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined;
         var n: usize = 0;
 
@@ -346,8 +356,8 @@ pub const CullingSystem = struct {
                 .offset = 0,
                 .range = aabb_range,
             };
-            cmd_infos[i] = c.VkDescriptorBufferInfo{
-                .buffer = self.command_buffers[i].buffer,
+            index_infos[i] = c.VkDescriptorBufferInfo{
+                .buffer = self.visible_index_buffers[i].buffer,
                 .offset = 0,
                 .range = c.VK_WHOLE_SIZE,
             };
@@ -372,7 +382,7 @@ pub const CullingSystem = struct {
             writes[n].dstBinding = 1;
             writes[n].descriptorType = c.VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
             writes[n].descriptorCount = 1;
-            writes[n].pBufferInfo = &cmd_infos[i];
+            writes[n].pBufferInfo = &index_infos[i];
             n += 1;
 
             writes[n] = std.mem.zeroes(c.VkWriteDescriptorSet);
@@ -407,13 +417,13 @@ pub const CullingSystem = struct {
 
         for (0..MAX_FRAMES_IN_FLIGHT) |i| {
             unmapAndDestroy(vk, &self.aabb_buffers[i]);
-            unmapAndDestroy(vk, &self.command_buffers[i]);
+            unmapAndDestroy(vk, &self.visible_index_buffers[i]);
             unmapAndDestroy(vk, &self.counter_buffers[i]);
             unmapAndDestroy(vk, &self.counter_readback_buffers[i]);
         }
 
         self.aabb_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer);
-        self.command_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer);
+        self.visible_index_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer);
         self.counter_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer);
         self.counter_readback_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer);
     }
diff --git a/src/world/world.zig b/src/world/world.zig
index cfa843b3..f131350f 100644
--- a/src/world/world.zig
+++ b/src/world/world.zig
@@ -157,7 +157,7 @@ pub const World = struct {
         };
 
         log.log.info("World.initGen: initializing WorldRenderer", .{});
-        world.renderer = try WorldRenderer.init(allocator, rhi.resourceManager(), rhi.renderContext(), rhi.query(), &world.storage);
+        world.renderer = try WorldRenderer.init(allocator, rhi.resourceManager(), rhi.renderContext(), rhi.query(), &world.storage, rhi);
         errdefer _ = world.renderer;
 
         log.log.info("World.initGen: initializing WorldStreamer (render_distance={})", .{safe_render_distance});
diff --git a/src/world/world_renderer.zig b/src/world/world_renderer.zig
index db6d9572..4d3f292b 100644
--- a/src/world/world_renderer.zig
+++ b/src/world/world_renderer.zig
@@ -1,4 +1,5 @@
 //! World renderer - handles chunk rendering, culling, and MDI.
+//! Integrates GPU compute frustum culling (CullingSystem) with CPU fallback.
 
 const std = @import("std");
 const log = @import("../engine/core/log.zig");
@@ -7,6 +8,7 @@ const ChunkStorage = @import("chunk_storage.zig").ChunkStorage;
 const worldToChunk = @import("chunk.zig").worldToChunk;
 const CHUNK_SIZE_X = @import("chunk.zig").CHUNK_SIZE_X;
 const CHUNK_SIZE_Z = @import("chunk.zig").CHUNK_SIZE_Z;
+const CHUNK_SIZE_Y = @import("chunk.zig").CHUNK_SIZE_Y;
 const GlobalVertexAllocator = @import("chunk_allocator.zig").GlobalVertexAllocator;
 const rhi_mod = @import("../engine/graphics/rhi.zig");
 const ResourceManager = rhi_mod.ResourceManager;
@@ -16,6 +18,8 @@ const LODManager = @import("lod_manager.zig").LODManager;
 const Vec3 = @import("../engine/math/vec3.zig").Vec3;
 const Mat4 = @import("../engine/math/mat4.zig").Mat4;
 const Frustum = @import("../engine/math/frustum.zig").Frustum;
+const CullingSystem = @import("../engine/graphics/vulkan/culling_system.zig").CullingSystem;
+const ChunkCullData = @import("../engine/graphics/vulkan/culling_system.zig").ChunkCullData;
 
 const MAX_MDI_CHUNKS: usize = 16384;
 
@@ -24,6 +28,7 @@ pub const RenderStats = struct {
     chunks_rendered: u32 = 0,
     chunks_culled: u32 = 0,
     vertices_rendered: u64 = 0,
+    gpu_culling: bool = false,
 };
 
 pub const ShadowStats = struct {
@@ -49,7 +54,14 @@ pub const WorldRenderer = struct {
     instance_buffers: [rhi_mod.MAX_FRAMES_IN_FLIGHT]rhi_mod.BufferHandle,
     indirect_buffers: [rhi_mod.MAX_FRAMES_IN_FLIGHT]rhi_mod.BufferHandle,
 
-    pub fn init(allocator: std.mem.Allocator, rm: ResourceManager, render_ctx: RenderContext, query: IDeviceQuery, storage: *ChunkStorage) !*WorldRenderer {
+    // GPU Culling
+    culling_system: ?*CullingSystem,
+    aabb_data: std.ArrayListUnmanaged(ChunkCullData),
+    chunk_lookup: std.ArrayListUnmanaged(*ChunkData),
+    gpu_visible_indices: std.ArrayListUnmanaged(u32),
+    use_gpu_culling: bool,
+
+    pub fn init(allocator: std.mem.Allocator, rm: ResourceManager, render_ctx: RenderContext, query: IDeviceQuery, storage: *ChunkStorage, rhi: rhi_mod.RHI) !*WorldRenderer {
         const renderer = try allocator.create(WorldRenderer);
 
         const safe_mode_env = std.posix.getenv("ZIGCRAFT_SAFE_MODE");
@@ -74,6 +86,16 @@ pub const WorldRenderer = struct {
             indirect_buffers[i] = try rm.createBuffer(max_chunks * @sizeOf(rhi_mod.DrawIndirectCommand) * 3, .indirect);
         }
 
+        var culling_system: ?*CullingSystem = null;
+        var use_gpu = false;
+        if (CullingSystem.init(allocator, rhi, max_chunks)) |cs| {
+            culling_system = cs;
+            use_gpu = true;
+            log.log.info("GPU frustum culling initialized (max_chunks={})", .{max_chunks});
+        } else |_| {
+            log.log.warn("GPU culling init failed, falling back to CPU culling", .{});
+        }
+
         renderer.* = .{
             .allocator = allocator,
             .storage = storage,
@@ -88,6 +110,11 @@ pub const WorldRenderer = struct {
             .draw_commands = .empty,
             .instance_buffers = instance_buffers,
             .indirect_buffers = indirect_buffers,
+            .culling_system = culling_system,
+            .aabb_data = .empty,
+            .chunk_lookup = .empty,
+            .gpu_visible_indices = .empty,
+            .use_gpu_culling = use_gpu,
         };
 
         return renderer;
@@ -98,16 +125,15 @@ pub const WorldRenderer = struct {
         self.vertex_allocator.tick(self.query.getFrameIndex());
     }
 
-    /// Reset shadow statistics before a new frame begins.
-    ///
-    /// `last_shadow_stats` then accumulates across all shadow passes in that frame.
-    /// If per-cascade statistics are needed, call this before each shadow pass.
     pub fn resetShadowStats(self: *WorldRenderer) void {
         self.last_shadow_stats = .{};
     }
 
     pub fn deinit(self: *WorldRenderer) void {
         self.visible_chunks.deinit(self.allocator);
+        self.aabb_data.deinit(self.allocator);
+        self.chunk_lookup.deinit(self.allocator);
+        self.gpu_visible_indices.deinit(self.allocator);
 
         for (0..rhi_mod.MAX_FRAMES_IN_FLIGHT) |i| {
             if (self.instance_buffers[i] != 0) self.rm.destroyBuffer(self.instance_buffers[i]);
@@ -116,13 +142,15 @@ pub const WorldRenderer = struct {
         self.instance_data.deinit(self.allocator);
         self.draw_commands.deinit(self.allocator);
 
+        if (self.culling_system) |cs| cs.deinit();
+
         self.vertex_allocator.deinit();
         self.allocator.destroy(self.vertex_allocator);
         self.allocator.destroy(self);
     }
 
     pub fn render(self: *WorldRenderer, view_proj: Mat4, camera_pos: Vec3, render_distance: i32, lod_manager: ?*LODManager, render_lod: bool) void {
-        self.last_render_stats = .{};
+        self.last_render_stats = .{ .gpu_culling = self.use_gpu_culling };
 
         self.storage.chunks_mutex.lockShared();
         defer self.storage.chunks_mutex.unlockShared();
@@ -137,8 +165,6 @@ pub const WorldRenderer = struct {
         self.instance_data.clearRetainingCapacity();
         self.draw_commands.clearRetainingCapacity();
 
-        const frustum = Frustum.fromViewProj(view_proj);
-
         if (!std.math.isFinite(camera_pos.x) or !std.math.isFinite(camera_pos.z)) return;
 
         const world_x: i64 = @intFromFloat(camera_pos.x);
@@ -149,22 +175,10 @@ pub const WorldRenderer = struct {
         const r_dist_val: i32 = if (lod_manager) |mgr| @min(render_distance, @as(i32, @intCast(mgr.config.getRadii()[0]))) else render_distance;
         const r_dist: i64 = @as(i64, @intCast(r_dist_val));
 
-        var cz = pc_z - r_dist;
-        while (cz <= pc_z + r_dist) : (cz += 1) {
-            var cx = pc_x - r_dist;
-            while (cx <= pc_x + r_dist) : (cx += 1) {
-                if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| {
-                    if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or data.mesh.cutout_allocation != null or data.mesh.fluid_allocation != null) {
-                        if (frustum.intersectsChunkRelative(@as(i32, @intCast(cx)), @as(i32, @intCast(cz)), camera_pos.x, camera_pos.y, camera_pos.z)) {
-                            self.visible_chunks.append(self.allocator, data) catch |err| {
-                                log.log.debug("MDI: visible_chunks append failed: {}", .{err});
-                            };
-                        } else {
-                            self.last_render_stats.chunks_culled += 1;
-                        }
-                    }
-                }
-            }
+        if (self.use_gpu_culling) {
+            self.renderGpuCull(view_proj, camera_pos, pc_x, pc_z, r_dist);
+        } else {
+            self.renderCpuCull(view_proj, camera_pos, pc_x, pc_z, r_dist);
         }
 
         self.last_render_stats.chunks_total = @intCast(self.storage.chunks.count());
@@ -259,8 +273,76 @@ pub const WorldRenderer = struct {
         }
     }
 
-    /// Intentionally excludes visual LOD meshes to prevent LOD offset/morphing
-    /// artifacts from corrupting shadow maps. Only real chunk geometry is rendered.
+    fn renderCpuCull(self: *WorldRenderer, view_proj: Mat4, camera_pos: Vec3, pc_x: i64, pc_z: i64, r_dist: i64) void {
+        const frustum = Frustum.fromViewProj(view_proj);
+
+        var cz = pc_z - r_dist;
+        while (cz <= pc_z + r_dist) : (cz += 1) {
+            var cx = pc_x - r_dist;
+            while (cx <= pc_x + r_dist) : (cx += 1) {
+                if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| {
+                    if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or data.mesh.cutout_allocation != null or data.mesh.fluid_allocation != null) {
+                        if (frustum.intersectsChunkRelative(@as(i32, @intCast(cx)), @as(i32, @intCast(cz)), camera_pos.x, camera_pos.y, camera_pos.z)) {
+                            self.visible_chunks.append(self.allocator, data) catch |err| {
+                                log.log.debug("MDI: visible_chunks append failed: {}", .{err});
+                            };
+                        } else {
+                            self.last_render_stats.chunks_culled += 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    fn renderGpuCull(self: *WorldRenderer, view_proj: Mat4, camera_pos: Vec3, pc_x: i64, pc_z: i64, r_dist: i64) void {
+        const cs = self.culling_system orelse unreachable;
+
+        self.aabb_data.clearRetainingCapacity();
+        self.chunk_lookup.clearRetainingCapacity();
+
+        var cz = pc_z - r_dist;
+        while (cz <= pc_z + r_dist) : (cz += 1) {
+            var cx = pc_x - r_dist;
+            while (cx <= pc_x + r_dist) : (cx += 1) {
+                if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| {
+                    if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or data.mesh.cutout_allocation != null or data.mesh.fluid_allocation != null) {
+                        const chunk_world_x: f32 = @floatFromInt(data.chunk.chunk_x * CHUNK_SIZE_X);
+                        const chunk_world_z: f32 = @floatFromInt(data.chunk.chunk_z * CHUNK_SIZE_Z);
+
+                        self.aabb_data.append(self.allocator, .{
+                            .min_point = .{ chunk_world_x - camera_pos.x, -camera_pos.y, chunk_world_z - camera_pos.z, 0.0 },
+                            .max_point = .{ chunk_world_x - camera_pos.x + @as(f32, @floatFromInt(CHUNK_SIZE_X)), -camera_pos.y + @as(f32, @floatFromInt(CHUNK_SIZE_Y)), chunk_world_z - camera_pos.z + @as(f32, @floatFromInt(CHUNK_SIZE_Z)), 0.0 },
+                        }) catch continue;
+                        self.chunk_lookup.append(self.allocator, data) catch continue;
+                    }
+                }
+            }
+        }
+
+        const chunk_count: u32 = @intCast(self.aabb_data.items.len);
+        if (chunk_count == 0) return;
+
+        const fi = self.query.getFrameIndex();
+        cs.updateAABBData(fi, self.aabb_data.items);
+        cs.dispatch(view_proj, chunk_count);
+
+        const visible_count = cs.readVisibleCount(fi);
+        self.gpu_visible_indices.clearRetainingCapacity();
+        if (visible_count > 0) {
+            self.gpu_visible_indices.resize(self.allocator, visible_count) catch return;
+            cs.readVisibleIndices(fi, visible_count, self.gpu_visible_indices.items);
+
+            for (self.gpu_visible_indices.items[0..@min(@as(usize, @intCast(visible_count)), self.gpu_visible_indices.items.len)]) |idx| {
+                if (idx < self.chunk_lookup.items.len) {
+                    self.visible_chunks.append(self.allocator, self.chunk_lookup.items[idx]) catch continue;
+                }
+            }
+        }
+
+        self.last_render_stats.chunks_culled += @intCast(chunk_count - @min(@as(u32, @intCast(self.visible_chunks.items.len)), chunk_count));
+    }
+
     pub fn renderShadowPass(self: *WorldRenderer, light_space_matrix: Mat4, camera_pos: Vec3, shadow_caster_distance: f32) void {
         const frustum = Frustum.fromViewProj(light_space_matrix);
 

From 4a3cc8f86de33c0cea99f552d3204ade09e1f85e Mon Sep 17 00:00:00 2001
From: MichaelFisher1997 <contact@michaelfisher.tech>
Date: Thu, 2 Apr 2026 22:30:06 +0100
Subject: [PATCH 2/5] fix: address review feedback for GPU culling integration

- Fix incorrect buffer offset in readVisibleIndices (src[2] -> src[4]):
  the shader's buffer header is 4 u32s (count + 3 padding words), not 2
- Fix GPU/CPU sync: read back the previous frame's results before
  dispatching the current frame. beginFrame waits on the fence, so the
  previous frame's data is ready when render() is called
- Log specific error when GPU culling init fails
- Break long AABB construction lines for readability
---
 src/engine/graphics/vulkan/culling_system.zig |  2 +-
 src/world/world_renderer.zig                  | 30 ++++++++++++-------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/engine/graphics/vulkan/culling_system.zig b/src/engine/graphics/vulkan/culling_system.zig
index e5d30989..bbe46dba 100644
--- a/src/engine/graphics/vulkan/culling_system.zig
+++ b/src/engine/graphics/vulkan/culling_system.zig
@@ -220,7 +220,7 @@ pub const CullingSystem = struct {
         const copy_count = @min(@as(usize, @intCast(count)), @min(out.len, self.max_chunks));
         if (copy_count == 0) return;
         const src: [*]const u32 = @ptrCast(@alignCast(buf.mapped_ptr.?));
-        @memcpy(out[0..copy_count], src[2 .. 2 + copy_count]);
+        @memcpy(out[0..copy_count], src[4 .. 4 + copy_count]);
     }
 
     fn copyCounterToReadback(self: *CullingSystem, cmd: c.VkCommandBuffer, frame_index: usize) void {
diff --git a/src/world/world_renderer.zig b/src/world/world_renderer.zig
index 4d3f292b..5a6cc04e 100644
--- a/src/world/world_renderer.zig
+++ b/src/world/world_renderer.zig
@@ -92,8 +92,8 @@ pub const WorldRenderer = struct {
             culling_system = cs;
             use_gpu = true;
             log.log.info("GPU frustum culling initialized (max_chunks={})", .{max_chunks});
-        } else |_| {
-            log.log.warn("GPU culling init failed, falling back to CPU culling", .{});
+        } else |err| {
+            log.log.warn("GPU culling init failed ({}), falling back to CPU culling", .{err});
         }
 
         renderer.* = .{
@@ -310,9 +310,15 @@ pub const WorldRenderer = struct {
                         const chunk_world_x: f32 = @floatFromInt(data.chunk.chunk_x * CHUNK_SIZE_X);
                         const chunk_world_z: f32 = @floatFromInt(data.chunk.chunk_z * CHUNK_SIZE_Z);
 
+                        const min_x = chunk_world_x - camera_pos.x;
+                        const min_z = chunk_world_z - camera_pos.z;
+                        const max_x = min_x + @as(f32, @floatFromInt(CHUNK_SIZE_X));
+                        const max_y = -camera_pos.y + @as(f32, @floatFromInt(CHUNK_SIZE_Y));
+                        const max_z = min_z + @as(f32, @floatFromInt(CHUNK_SIZE_Z));
+
                         self.aabb_data.append(self.allocator, .{
-                            .min_point = .{ chunk_world_x - camera_pos.x, -camera_pos.y, chunk_world_z - camera_pos.z, 0.0 },
-                            .max_point = .{ chunk_world_x - camera_pos.x + @as(f32, @floatFromInt(CHUNK_SIZE_X)), -camera_pos.y + @as(f32, @floatFromInt(CHUNK_SIZE_Y)), chunk_world_z - camera_pos.z + @as(f32, @floatFromInt(CHUNK_SIZE_Z)), 0.0 },
+                            .min_point = .{ min_x, -camera_pos.y, min_z, 0.0 },
+                            .max_point = .{ max_x, max_y, max_z, 0.0 },
                         }) catch continue;
                         self.chunk_lookup.append(self.allocator, data) catch continue;
                     }
@@ -324,16 +330,15 @@ pub const WorldRenderer = struct {
         if (chunk_count == 0) return;
 
         const fi = self.query.getFrameIndex();
-        cs.updateAABBData(fi, self.aabb_data.items);
-        cs.dispatch(view_proj, chunk_count);
 
-        const visible_count = cs.readVisibleCount(fi);
+        const prev_visible_count = cs.readVisibleCount(fi);
         self.gpu_visible_indices.clearRetainingCapacity();
-        if (visible_count > 0) {
-            self.gpu_visible_indices.resize(self.allocator, visible_count) catch return;
-            cs.readVisibleIndices(fi, visible_count, self.gpu_visible_indices.items);
+        if (prev_visible_count > 0) {
+            self.gpu_visible_indices.resize(self.allocator, prev_visible_count) catch return;
+            cs.readVisibleIndices(fi, prev_visible_count, self.gpu_visible_indices.items);
 
-            for (self.gpu_visible_indices.items[0..@min(@as(usize, @intCast(visible_count)), self.gpu_visible_indices.items.len)]) |idx| {
+            const limit = @min(@as(usize, @intCast(prev_visible_count)), self.gpu_visible_indices.items.len);
+            for (self.gpu_visible_indices.items[0..limit]) |idx| {
                 if (idx < self.chunk_lookup.items.len) {
                     self.visible_chunks.append(self.allocator, self.chunk_lookup.items[idx]) catch continue;
                 }
@@ -341,6 +346,9 @@ pub const WorldRenderer = struct {
         }
 
         self.last_render_stats.chunks_culled += @intCast(chunk_count - @min(@as(u32, @intCast(self.visible_chunks.items.len)), chunk_count));
+
+        cs.updateAABBData(fi, self.aabb_data.items);
+        cs.dispatch(view_proj, chunk_count);
     }
 
     pub fn renderShadowPass(self: *WorldRenderer, light_space_matrix: Mat4, camera_pos: Vec3, shadow_caster_distance: f32) void {

From 84afe50934d5ea8805ee29787e13de7b68a93f13 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997 <contact@michaelfisher.tech>
Date: Fri, 3 Apr 2026 00:09:26 +0100
Subject: [PATCH 3/5] fix: resolve chunk_lookup sync bug and improve GPU
 culling robustness

---
 src/world/world_renderer.zig | 70 ++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 31 deletions(-)

diff --git a/src/world/world_renderer.zig b/src/world/world_renderer.zig
index 5a6cc04e..d7f593ac 100644
--- a/src/world/world_renderer.zig
+++ b/src/world/world_renderer.zig
@@ -295,8 +295,44 @@ pub const WorldRenderer = struct {
         }
     }
 
+    fn chunkAABB(chunk_x: i32, chunk_z: i32, camera_pos: Vec3) ChunkCullData {
+        const world_x: f32 = @floatFromInt(chunk_x * CHUNK_SIZE_X);
+        const world_z: f32 = @floatFromInt(chunk_z * CHUNK_SIZE_Z);
+        return .{
+            .min_point = .{ world_x - camera_pos.x, -camera_pos.y, world_z - camera_pos.z, 0.0 },
+            .max_point = .{
+                world_x - camera_pos.x + @as(f32, @floatFromInt(CHUNK_SIZE_X)),
+                -camera_pos.y + @as(f32, @floatFromInt(CHUNK_SIZE_Y)),
+                world_z - camera_pos.z + @as(f32, @floatFromInt(CHUNK_SIZE_Z)),
+                0.0,
+            },
+        };
+    }
+
     fn renderGpuCull(self: *WorldRenderer, view_proj: Mat4, camera_pos: Vec3, pc_x: i64, pc_z: i64, r_dist: i64) void {
-        const cs = self.culling_system orelse unreachable;
+        const cs = self.culling_system orelse {
+            log.log.err("GPU culling enabled but system is null, falling back to CPU", .{});
+            self.use_gpu_culling = false;
+            return self.renderCpuCull(view_proj, camera_pos, pc_x, pc_z, r_dist);
+        };
+
+        const fi = self.query.getFrameIndex();
+
+        const prev_visible_count = cs.readVisibleCount(fi);
+        self.gpu_visible_indices.clearRetainingCapacity();
+        if (prev_visible_count > 0) {
+            self.gpu_visible_indices.resize(self.allocator, prev_visible_count) catch return;
+            cs.readVisibleIndices(fi, prev_visible_count, self.gpu_visible_indices.items);
+
+            const limit = @min(@as(usize, @intCast(prev_visible_count)), self.gpu_visible_indices.items.len);
+            for (self.gpu_visible_indices.items[0..limit]) |idx| {
+                if (idx < self.chunk_lookup.items.len) {
+                    self.visible_chunks.append(self.allocator, self.chunk_lookup.items[idx]) catch continue;
+                }
+            }
+        }
+
+        const prev_rendered: u32 = @intCast(self.visible_chunks.items.len);
 
         self.aabb_data.clearRetainingCapacity();
         self.chunk_lookup.clearRetainingCapacity();
@@ -307,19 +343,7 @@ pub const WorldRenderer = struct {
             while (cx <= pc_x + r_dist) : (cx += 1) {
                 if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| {
                     if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or data.mesh.cutout_allocation != null or data.mesh.fluid_allocation != null) {
-                        const chunk_world_x: f32 = @floatFromInt(data.chunk.chunk_x * CHUNK_SIZE_X);
-                        const chunk_world_z: f32 = @floatFromInt(data.chunk.chunk_z * CHUNK_SIZE_Z);
-
-                        const min_x = chunk_world_x - camera_pos.x;
-                        const min_z = chunk_world_z - camera_pos.z;
-                        const max_x = min_x + @as(f32, @floatFromInt(CHUNK_SIZE_X));
-                        const max_y = -camera_pos.y + @as(f32, @floatFromInt(CHUNK_SIZE_Y));
-                        const max_z = min_z + @as(f32, @floatFromInt(CHUNK_SIZE_Z));
-
-                        self.aabb_data.append(self.allocator, .{
-                            .min_point = .{ min_x, -camera_pos.y, min_z, 0.0 },
-                            .max_point = .{ max_x, max_y, max_z, 0.0 },
-                        }) catch continue;
+                        self.aabb_data.append(self.allocator, chunkAABB(data.chunk.chunk_x, data.chunk.chunk_z, camera_pos)) catch continue;
                         self.chunk_lookup.append(self.allocator, data) catch continue;
                     }
                 }
@@ -329,23 +353,7 @@ pub const WorldRenderer = struct {
         const chunk_count: u32 = @intCast(self.aabb_data.items.len);
         if (chunk_count == 0) return;
 
-        const fi = self.query.getFrameIndex();
-
-        const prev_visible_count = cs.readVisibleCount(fi);
-        self.gpu_visible_indices.clearRetainingCapacity();
-        if (prev_visible_count > 0) {
-            self.gpu_visible_indices.resize(self.allocator, prev_visible_count) catch return;
-            cs.readVisibleIndices(fi, prev_visible_count, self.gpu_visible_indices.items);
-
-            const limit = @min(@as(usize, @intCast(prev_visible_count)), self.gpu_visible_indices.items.len);
-            for (self.gpu_visible_indices.items[0..limit]) |idx| {
-                if (idx < self.chunk_lookup.items.len) {
-                    self.visible_chunks.append(self.allocator, self.chunk_lookup.items[idx]) catch continue;
-                }
-            }
-        }
-
-        self.last_render_stats.chunks_culled += @intCast(chunk_count - @min(@as(u32, @intCast(self.visible_chunks.items.len)), chunk_count));
+        self.last_render_stats.chunks_culled += chunk_count - @min(prev_rendered, chunk_count);
 
         cs.updateAABBData(fi, self.aabb_data.items);
         cs.dispatch(view_proj, chunk_count);

From 109355a4c839b6732dbcad82fc5e0f5939d87e36 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997 <contact@michaelfisher.tech>
Date: Fri, 3 Apr 2026 00:25:08 +0100
Subject: [PATCH 4/5] chore: trigger PR refresh


From f0844f9644e7335b67840ae4f03e6b2fd86c64b3 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997 <contact@michaelfisher.tech>
Date: Fri, 3 Apr 2026 00:37:21 +0100
Subject: [PATCH 5/5] fix: double-buffer GPU chunk lookup

---
 src/world/world_renderer.zig | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/world/world_renderer.zig b/src/world/world_renderer.zig
index d7f593ac..96da0f66 100644
--- a/src/world/world_renderer.zig
+++ b/src/world/world_renderer.zig
@@ -57,7 +57,7 @@ pub const WorldRenderer = struct {
     // GPU Culling
     culling_system: ?*CullingSystem,
     aabb_data: std.ArrayListUnmanaged(ChunkCullData),
-    chunk_lookup: std.ArrayListUnmanaged(*ChunkData),
+    chunk_lookup: [rhi_mod.MAX_FRAMES_IN_FLIGHT]std.ArrayListUnmanaged(*ChunkData),
     gpu_visible_indices: std.ArrayListUnmanaged(u32),
     use_gpu_culling: bool,
 
@@ -112,11 +112,13 @@ pub const WorldRenderer = struct {
             .indirect_buffers = indirect_buffers,
             .culling_system = culling_system,
             .aabb_data = .empty,
-            .chunk_lookup = .empty,
+            .chunk_lookup = undefined,
             .gpu_visible_indices = .empty,
             .use_gpu_culling = use_gpu,
         };
 
+        for (&renderer.chunk_lookup) |*lookup| lookup.* = .empty;
+
         return renderer;
     }
 
@@ -132,7 +134,7 @@ pub const WorldRenderer = struct {
     pub fn deinit(self: *WorldRenderer) void {
         self.visible_chunks.deinit(self.allocator);
         self.aabb_data.deinit(self.allocator);
-        self.chunk_lookup.deinit(self.allocator);
+        for (&self.chunk_lookup) |*lookup| lookup.deinit(self.allocator);
         self.gpu_visible_indices.deinit(self.allocator);
 
         for (0..rhi_mod.MAX_FRAMES_IN_FLIGHT) |i| {
@@ -317,17 +319,18 @@ pub const WorldRenderer = struct {
         };
 
         const fi = self.query.getFrameIndex();
+        const prev_fi = (fi + rhi_mod.MAX_FRAMES_IN_FLIGHT - 1) % rhi_mod.MAX_FRAMES_IN_FLIGHT;
 
-        const prev_visible_count = cs.readVisibleCount(fi);
+        const prev_visible_count = cs.readVisibleCount(prev_fi);
         self.gpu_visible_indices.clearRetainingCapacity();
         if (prev_visible_count > 0) {
             self.gpu_visible_indices.resize(self.allocator, prev_visible_count) catch return;
-            cs.readVisibleIndices(fi, prev_visible_count, self.gpu_visible_indices.items);
+            cs.readVisibleIndices(prev_fi, prev_visible_count, self.gpu_visible_indices.items);
 
             const limit = @min(@as(usize, @intCast(prev_visible_count)), self.gpu_visible_indices.items.len);
             for (self.gpu_visible_indices.items[0..limit]) |idx| {
-                if (idx < self.chunk_lookup.items.len) {
-                    self.visible_chunks.append(self.allocator, self.chunk_lookup.items[idx]) catch continue;
+                if (idx < self.chunk_lookup[prev_fi].items.len) {
+                    self.visible_chunks.append(self.allocator, self.chunk_lookup[prev_fi].items[idx]) catch continue;
                 }
             }
         }
@@ -335,7 +338,7 @@ pub const WorldRenderer = struct {
         const prev_rendered: u32 = @intCast(self.visible_chunks.items.len);
 
         self.aabb_data.clearRetainingCapacity();
-        self.chunk_lookup.clearRetainingCapacity();
+        self.chunk_lookup[fi].clearRetainingCapacity();
 
         var cz = pc_z - r_dist;
         while (cz <= pc_z + r_dist) : (cz += 1) {
@@ -344,7 +347,7 @@ pub const WorldRenderer = struct {
                 if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| {
                     if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or data.mesh.cutout_allocation != null or data.mesh.fluid_allocation != null) {
                         self.aabb_data.append(self.allocator, chunkAABB(data.chunk.chunk_x, data.chunk.chunk_z, camera_pos)) catch continue;
-                        self.chunk_lookup.append(self.allocator, data) catch continue;
+                        self.chunk_lookup[fi].append(self.allocator, data) catch continue;
                     }
                 }
             }