diff --git a/assets/shaders/vulkan/culling.comp b/assets/shaders/vulkan/culling.comp index 8865da6..ad72efe 100644 --- a/assets/shaders/vulkan/culling.comp +++ b/assets/shaders/vulkan/culling.comp @@ -2,13 +2,6 @@ layout(local_size_x = 64) in; -struct DrawIndirectCommand { - uint vertexCount; - uint instanceCount; - uint firstVertex; - uint firstInstance; -}; - struct ChunkAABB { vec4 min_point; vec4 max_point; @@ -18,13 +11,13 @@ layout(std430, binding = 0) readonly buffer ChunkAABBs { ChunkAABB chunks[]; } aabb_buffer; -layout(std430, binding = 1) coherent writeonly buffer DrawCommands { - uint visible_count; +layout(std430, binding = 1) writeonly buffer VisibleIndices { + uint count; uint _pad0; uint _pad1; uint _pad2; - DrawIndirectCommand commands[]; -} cmd_buffer; + uint indices[]; +} visible_buffer; layout(std430, binding = 2) coherent buffer VisibleCountBuffer { uint count; @@ -68,11 +61,8 @@ void main() { if (aabbVisible(aabb_min, aabb_max)) { uint slot = atomicAdd(visible_counter.count, 1); - if (slot < cmd_buffer.commands.length()) { - cmd_buffer.commands[slot].vertexCount = 0; - cmd_buffer.commands[slot].instanceCount = 1; - cmd_buffer.commands[slot].firstVertex = 0; - cmd_buffer.commands[slot].firstInstance = idx; + if (slot < visible_buffer.indices.length()) { + visible_buffer.indices[slot] = idx; } } } diff --git a/src/engine/graphics/vulkan/culling_system.zig b/src/engine/graphics/vulkan/culling_system.zig index 0b2a739..bbe46db 100644 --- a/src/engine/graphics/vulkan/culling_system.zig +++ b/src/engine/graphics/vulkan/culling_system.zig @@ -1,8 +1,8 @@ const std = @import("std"); -const c = @import("../../c.zig").c; +const c = @import("../../../c.zig").c; const rhi_pkg = @import("../rhi.zig"); -const log = @import("../core/log.zig"); -const Mat4 = @import("../math/mat4.zig").Mat4; +const log = @import("../../core/log.zig"); +const Mat4 = @import("../../math/mat4.zig").Mat4; const VulkanContext = @import("rhi_context_types.zig").VulkanContext; const 
Utils = @import("utils.zig"); @@ -26,7 +26,7 @@ pub const CullingSystem = struct { vk_ctx: *VulkanContext, aabb_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer, - command_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer, + visible_index_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer, counter_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer, counter_readback_buffers: [MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer, @@ -59,7 +59,7 @@ pub const CullingSystem = struct { .vk_ctx = vk_ctx, .max_chunks = clamped_max, .aabb_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer), - .command_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer), + .visible_index_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer), .counter_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer), .counter_readback_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer), .descriptor_sets = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]c.VkDescriptorSet), @@ -68,7 +68,7 @@ pub const CullingSystem = struct { errdefer self.destroyAllBuffers(); const aabb_size = clamped_max * @sizeOf(ChunkCullData); - const cmd_size = @sizeOf(u32) * 4 + clamped_max * @sizeOf(c.VkDrawIndirectCommand); + const index_buffer_size = @sizeOf(u32) * 4 + clamped_max * @sizeOf(u32); for (0..MAX_FRAMES_IN_FLIGHT) |i| { self.aabb_buffers[i] = try Utils.createVulkanBuffer( @@ -78,11 +78,11 @@ pub const CullingSystem = struct { c.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | c.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, ); - self.command_buffers[i] = try Utils.createVulkanBuffer( + self.visible_index_buffers[i] = try Utils.createVulkanBuffer( &vk_ctx.vulkan_device, - cmd_size, - c.VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | c.VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT, - c.VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + index_buffer_size, + c.VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + c.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | c.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, ); self.counter_buffers[i] = try 
Utils.createVulkanBuffer( @@ -189,11 +189,11 @@ pub const CullingSystem = struct { var compute_barrier = std.mem.zeroes(c.VkMemoryBarrier); compute_barrier.sType = c.VK_STRUCTURE_TYPE_MEMORY_BARRIER; compute_barrier.srcAccessMask = c.VK_ACCESS_SHADER_WRITE_BIT; - compute_barrier.dstAccessMask = c.VK_ACCESS_INDIRECT_COMMAND_READ_BIT | c.VK_ACCESS_TRANSFER_READ_BIT; + compute_barrier.dstAccessMask = c.VK_ACCESS_HOST_READ_BIT | c.VK_ACCESS_TRANSFER_READ_BIT; c.vkCmdPipelineBarrier( cmd, c.VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - c.VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | c.VK_PIPELINE_STAGE_TRANSFER_BIT, + c.VK_PIPELINE_STAGE_HOST_BIT | c.VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &compute_barrier, @@ -213,6 +213,16 @@ pub const CullingSystem = struct { return ptr.*; } + pub fn readVisibleIndices(self: *CullingSystem, frame_index: usize, count: u32, out: []u32) void { + if (count == 0) return; + const buf = &self.visible_index_buffers[frame_index]; + if (buf.mapped_ptr == null) return; + const copy_count = @min(@as(usize, @intCast(count)), @min(out.len, self.max_chunks)); + if (copy_count == 0) return; + const src: [*]const u32 = @ptrCast(@alignCast(buf.mapped_ptr.?)); + @memcpy(out[0..copy_count], src[4 .. 
4 + copy_count]); + } + fn copyCounterToReadback(self: *CullingSystem, cmd: c.VkCommandBuffer, frame_index: usize) void { const src = self.counter_buffers[frame_index]; const dst = self.counter_readback_buffers[frame_index]; @@ -336,7 +346,7 @@ pub const CullingSystem = struct { var writes: [3 * MAX_FRAMES_IN_FLIGHT]c.VkWriteDescriptorSet = undefined; var aabb_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined; - var cmd_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined; + var index_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined; var counter_infos: [MAX_FRAMES_IN_FLIGHT]c.VkDescriptorBufferInfo = undefined; var n: usize = 0; @@ -346,8 +356,8 @@ pub const CullingSystem = struct { .offset = 0, .range = aabb_range, }; - cmd_infos[i] = c.VkDescriptorBufferInfo{ - .buffer = self.command_buffers[i].buffer, + index_infos[i] = c.VkDescriptorBufferInfo{ + .buffer = self.visible_index_buffers[i].buffer, .offset = 0, .range = c.VK_WHOLE_SIZE, }; @@ -372,7 +382,7 @@ pub const CullingSystem = struct { writes[n].dstBinding = 1; writes[n].descriptorType = c.VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; writes[n].descriptorCount = 1; - writes[n].pBufferInfo = &cmd_infos[i]; + writes[n].pBufferInfo = &index_infos[i]; n += 1; writes[n] = std.mem.zeroes(c.VkWriteDescriptorSet); @@ -407,13 +417,13 @@ pub const CullingSystem = struct { for (0..MAX_FRAMES_IN_FLIGHT) |i| { unmapAndDestroy(vk, &self.aabb_buffers[i]); - unmapAndDestroy(vk, &self.command_buffers[i]); + unmapAndDestroy(vk, &self.visible_index_buffers[i]); unmapAndDestroy(vk, &self.counter_buffers[i]); unmapAndDestroy(vk, &self.counter_readback_buffers[i]); } self.aabb_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer); - self.command_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer); + self.visible_index_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer); self.counter_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer); 
self.counter_readback_buffers = std.mem.zeroes([MAX_FRAMES_IN_FLIGHT]Utils.VulkanBuffer); } diff --git a/src/world/world.zig b/src/world/world.zig index cfa843b..f131350 100644 --- a/src/world/world.zig +++ b/src/world/world.zig @@ -157,7 +157,7 @@ pub const World = struct { }; log.log.info("World.initGen: initializing WorldRenderer", .{}); - world.renderer = try WorldRenderer.init(allocator, rhi.resourceManager(), rhi.renderContext(), rhi.query(), &world.storage); + world.renderer = try WorldRenderer.init(allocator, rhi.resourceManager(), rhi.renderContext(), rhi.query(), &world.storage, rhi); errdefer _ = world.renderer; log.log.info("World.initGen: initializing WorldStreamer (render_distance={})", .{safe_render_distance}); diff --git a/src/world/world_renderer.zig b/src/world/world_renderer.zig index db6d957..96da0f6 100644 --- a/src/world/world_renderer.zig +++ b/src/world/world_renderer.zig @@ -1,4 +1,5 @@ //! World renderer - handles chunk rendering, culling, and MDI. +//! Integrates GPU compute frustum culling (CullingSystem) with CPU fallback. 
const std = @import("std"); const log = @import("../engine/core/log.zig"); @@ -7,6 +8,7 @@ const ChunkStorage = @import("chunk_storage.zig").ChunkStorage; const worldToChunk = @import("chunk.zig").worldToChunk; const CHUNK_SIZE_X = @import("chunk.zig").CHUNK_SIZE_X; const CHUNK_SIZE_Z = @import("chunk.zig").CHUNK_SIZE_Z; +const CHUNK_SIZE_Y = @import("chunk.zig").CHUNK_SIZE_Y; const GlobalVertexAllocator = @import("chunk_allocator.zig").GlobalVertexAllocator; const rhi_mod = @import("../engine/graphics/rhi.zig"); const ResourceManager = rhi_mod.ResourceManager; @@ -16,6 +18,8 @@ const LODManager = @import("lod_manager.zig").LODManager; const Vec3 = @import("../engine/math/vec3.zig").Vec3; const Mat4 = @import("../engine/math/mat4.zig").Mat4; const Frustum = @import("../engine/math/frustum.zig").Frustum; +const CullingSystem = @import("../engine/graphics/vulkan/culling_system.zig").CullingSystem; +const ChunkCullData = @import("../engine/graphics/vulkan/culling_system.zig").ChunkCullData; const MAX_MDI_CHUNKS: usize = 16384; @@ -24,6 +28,7 @@ pub const RenderStats = struct { chunks_rendered: u32 = 0, chunks_culled: u32 = 0, vertices_rendered: u64 = 0, + gpu_culling: bool = false, }; pub const ShadowStats = struct { @@ -49,7 +54,14 @@ pub const WorldRenderer = struct { instance_buffers: [rhi_mod.MAX_FRAMES_IN_FLIGHT]rhi_mod.BufferHandle, indirect_buffers: [rhi_mod.MAX_FRAMES_IN_FLIGHT]rhi_mod.BufferHandle, - pub fn init(allocator: std.mem.Allocator, rm: ResourceManager, render_ctx: RenderContext, query: IDeviceQuery, storage: *ChunkStorage) !*WorldRenderer { + // GPU Culling + culling_system: ?*CullingSystem, + aabb_data: std.ArrayListUnmanaged(ChunkCullData), + chunk_lookup: [rhi_mod.MAX_FRAMES_IN_FLIGHT]std.ArrayListUnmanaged(*ChunkData), + gpu_visible_indices: std.ArrayListUnmanaged(u32), + use_gpu_culling: bool, + + pub fn init(allocator: std.mem.Allocator, rm: ResourceManager, render_ctx: RenderContext, query: IDeviceQuery, storage: *ChunkStorage, rhi: 
rhi_mod.RHI) !*WorldRenderer { const renderer = try allocator.create(WorldRenderer); const safe_mode_env = std.posix.getenv("ZIGCRAFT_SAFE_MODE"); @@ -74,6 +86,16 @@ pub const WorldRenderer = struct { indirect_buffers[i] = try rm.createBuffer(max_chunks * @sizeOf(rhi_mod.DrawIndirectCommand) * 3, .indirect); } + var culling_system: ?*CullingSystem = null; + var use_gpu = false; + if (CullingSystem.init(allocator, rhi, max_chunks)) |cs| { + culling_system = cs; + use_gpu = true; + log.log.info("GPU frustum culling initialized (max_chunks={})", .{max_chunks}); + } else |err| { + log.log.warn("GPU culling init failed ({}), falling back to CPU culling", .{err}); + } + renderer.* = .{ .allocator = allocator, .storage = storage, @@ -88,8 +110,15 @@ pub const WorldRenderer = struct { .draw_commands = .empty, .instance_buffers = instance_buffers, .indirect_buffers = indirect_buffers, + .culling_system = culling_system, + .aabb_data = .empty, + .chunk_lookup = undefined, + .gpu_visible_indices = .empty, + .use_gpu_culling = use_gpu, }; + for (&renderer.chunk_lookup) |*lookup| lookup.* = .empty; + return renderer; } @@ -98,16 +127,15 @@ pub const WorldRenderer = struct { self.vertex_allocator.tick(self.query.getFrameIndex()); } - /// Reset shadow statistics before a new frame begins. - /// - /// `last_shadow_stats` then accumulates across all shadow passes in that frame. - /// If per-cascade statistics are needed, call this before each shadow pass. 
pub fn resetShadowStats(self: *WorldRenderer) void { self.last_shadow_stats = .{}; } pub fn deinit(self: *WorldRenderer) void { self.visible_chunks.deinit(self.allocator); + self.aabb_data.deinit(self.allocator); + for (&self.chunk_lookup) |*lookup| lookup.deinit(self.allocator); + self.gpu_visible_indices.deinit(self.allocator); for (0..rhi_mod.MAX_FRAMES_IN_FLIGHT) |i| { if (self.instance_buffers[i] != 0) self.rm.destroyBuffer(self.instance_buffers[i]); @@ -116,13 +144,15 @@ pub const WorldRenderer = struct { self.instance_data.deinit(self.allocator); self.draw_commands.deinit(self.allocator); + if (self.culling_system) |cs| cs.deinit(); + self.vertex_allocator.deinit(); self.allocator.destroy(self.vertex_allocator); self.allocator.destroy(self); } pub fn render(self: *WorldRenderer, view_proj: Mat4, camera_pos: Vec3, render_distance: i32, lod_manager: ?*LODManager, render_lod: bool) void { - self.last_render_stats = .{}; + self.last_render_stats = .{ .gpu_culling = self.use_gpu_culling }; self.storage.chunks_mutex.lockShared(); defer self.storage.chunks_mutex.unlockShared(); @@ -137,8 +167,6 @@ pub const WorldRenderer = struct { self.instance_data.clearRetainingCapacity(); self.draw_commands.clearRetainingCapacity(); - const frustum = Frustum.fromViewProj(view_proj); - if (!std.math.isFinite(camera_pos.x) or !std.math.isFinite(camera_pos.z)) return; const world_x: i64 = @intFromFloat(camera_pos.x); @@ -149,22 +177,10 @@ pub const WorldRenderer = struct { const r_dist_val: i32 = if (lod_manager) |mgr| @min(render_distance, @as(i32, @intCast(mgr.config.getRadii()[0]))) else render_distance; const r_dist: i64 = @as(i64, @intCast(r_dist_val)); - var cz = pc_z - r_dist; - while (cz <= pc_z + r_dist) : (cz += 1) { - var cx = pc_x - r_dist; - while (cx <= pc_x + r_dist) : (cx += 1) { - if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| { - if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or 
data.mesh.cutout_allocation != null or data.mesh.fluid_allocation != null) { - if (frustum.intersectsChunkRelative(@as(i32, @intCast(cx)), @as(i32, @intCast(cz)), camera_pos.x, camera_pos.y, camera_pos.z)) { - self.visible_chunks.append(self.allocator, data) catch |err| { - log.log.debug("MDI: visible_chunks append failed: {}", .{err}); - }; - } else { - self.last_render_stats.chunks_culled += 1; - } - } - } - } + if (self.use_gpu_culling) { + self.renderGpuCull(view_proj, camera_pos, pc_x, pc_z, r_dist); + } else { + self.renderCpuCull(view_proj, camera_pos, pc_x, pc_z, r_dist); } self.last_render_stats.chunks_total = @intCast(self.storage.chunks.count()); @@ -259,8 +275,93 @@ pub const WorldRenderer = struct { } } - /// Intentionally excludes visual LOD meshes to prevent LOD offset/morphing - /// artifacts from corrupting shadow maps. Only real chunk geometry is rendered. + fn renderCpuCull(self: *WorldRenderer, view_proj: Mat4, camera_pos: Vec3, pc_x: i64, pc_z: i64, r_dist: i64) void { + const frustum = Frustum.fromViewProj(view_proj); + + var cz = pc_z - r_dist; + while (cz <= pc_z + r_dist) : (cz += 1) { + var cx = pc_x - r_dist; + while (cx <= pc_x + r_dist) : (cx += 1) { + if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| { + if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or data.mesh.cutout_allocation != null or data.mesh.fluid_allocation != null) { + if (frustum.intersectsChunkRelative(@as(i32, @intCast(cx)), @as(i32, @intCast(cz)), camera_pos.x, camera_pos.y, camera_pos.z)) { + self.visible_chunks.append(self.allocator, data) catch |err| { + log.log.debug("MDI: visible_chunks append failed: {}", .{err}); + }; + } else { + self.last_render_stats.chunks_culled += 1; + } + } + } + } + } + } + + fn chunkAABB(chunk_x: i32, chunk_z: i32, camera_pos: Vec3) ChunkCullData { + const world_x: f32 = @floatFromInt(chunk_x * CHUNK_SIZE_X); + const world_z: f32 = @floatFromInt(chunk_z * 
CHUNK_SIZE_Z); + return .{ + .min_point = .{ world_x - camera_pos.x, -camera_pos.y, world_z - camera_pos.z, 0.0 }, + .max_point = .{ + world_x - camera_pos.x + @as(f32, @floatFromInt(CHUNK_SIZE_X)), + -camera_pos.y + @as(f32, @floatFromInt(CHUNK_SIZE_Y)), + world_z - camera_pos.z + @as(f32, @floatFromInt(CHUNK_SIZE_Z)), + 0.0, + }, + }; + } + + fn renderGpuCull(self: *WorldRenderer, view_proj: Mat4, camera_pos: Vec3, pc_x: i64, pc_z: i64, r_dist: i64) void { + const cs = self.culling_system orelse { + log.log.err("GPU culling enabled but system is null, falling back to CPU", .{}); + self.use_gpu_culling = false; + return self.renderCpuCull(view_proj, camera_pos, pc_x, pc_z, r_dist); + }; + + const fi = self.query.getFrameIndex(); + const prev_fi = (fi + rhi_mod.MAX_FRAMES_IN_FLIGHT - 1) % rhi_mod.MAX_FRAMES_IN_FLIGHT; + + const prev_visible_count = cs.readVisibleCount(prev_fi); + self.gpu_visible_indices.clearRetainingCapacity(); + if (prev_visible_count > 0) { + const clamped: usize = @min(@as(usize, @intCast(prev_visible_count)), cs.max_chunks); + self.gpu_visible_indices.resize(self.allocator, clamped) catch return; + cs.readVisibleIndices(prev_fi, prev_visible_count, self.gpu_visible_indices.items); + const limit = self.gpu_visible_indices.items.len; + for (self.gpu_visible_indices.items[0..limit]) |idx| { + if (idx < self.chunk_lookup[prev_fi].items.len) { + self.visible_chunks.append(self.allocator, self.chunk_lookup[prev_fi].items[idx]) catch continue; + } + } + } + + const prev_rendered: u32 = @intCast(self.visible_chunks.items.len); + + self.aabb_data.clearRetainingCapacity(); + self.chunk_lookup[fi].clearRetainingCapacity(); + + var cz = pc_z - r_dist; + while (cz <= pc_z + r_dist) : (cz += 1) { + var cx = pc_x - r_dist; + while (cx <= pc_x + r_dist) : (cx += 1) { + if (self.storage.chunks.get(.{ .x = @as(i32, @intCast(cx)), .z = @as(i32, @intCast(cz)) })) |data| { + if (data.chunk.state == .renderable or data.mesh.solid_allocation != null or data.mesh.cutout_allocation != null or
data.mesh.fluid_allocation != null) { + self.aabb_data.append(self.allocator, chunkAABB(data.chunk.chunk_x, data.chunk.chunk_z, camera_pos)) catch continue; + self.chunk_lookup[fi].append(self.allocator, data) catch continue; + } + } + } + } + + const chunk_count: u32 = @intCast(@min(self.aabb_data.items.len, cs.max_chunks)); + if (chunk_count == 0) return; + + self.last_render_stats.chunks_culled += chunk_count - @min(prev_rendered, chunk_count); + + cs.updateAABBData(fi, self.aabb_data.items[0..chunk_count]); + cs.dispatch(view_proj, chunk_count); + } + pub fn renderShadowPass(self: *WorldRenderer, light_space_matrix: Mat4, camera_pos: Vec3, shadow_caster_distance: f32) void { const frustum = Frustum.fromViewProj(light_space_matrix);