From 34f8ae05c75015adaffd85b3a127528fd4f0bf9b Mon Sep 17 00:00:00 2001 From: Gaiden-Spence Date: Tue, 14 Oct 2025 22:25:09 -0400 Subject: [PATCH 1/7] updated zon file to 0.15 --- build.zig.zon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.zig.zon b/build.zig.zon index 7b6432d..fdc53d5 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -2,7 +2,7 @@ .name = "katana", .version = "0.1.0", - .minimum_zig_version = "0.13.0", + .minimum_zig_version = "0.15.0", .paths = .{ // Include all source files From cbe375a4ad2fd93cae9bcb9ee0592d3d44f25622 Mon Sep 17 00:00:00 2001 From: Gaiden-Spence Date: Sat, 18 Oct 2025 23:11:43 -0400 Subject: [PATCH 2/7] updated build file for 0.15 --- build.zig | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/build.zig b/build.zig index 1a9423b..75d4195 100644 --- a/build.zig +++ b/build.zig @@ -24,9 +24,11 @@ pub fn build(b: *std.Build) void { // Tests const tests = b.addTest(.{ - .root_source_file = b.path("src/tests.zig"), - .target = target, - .optimize = .ReleaseSafe, // ReleaseSafe for tests + .root_module = b.createModule(.{ + .root_source_file = b.path("src/tests.zig"), + .target = target, + .optimize = .ReleaseSafe, + }), }); const run_tests = b.addRunArtifact(tests); @@ -42,11 +44,12 @@ pub fn build(b: *std.Build) void { // Benchmarks const bench = b.addExecutable(.{ .name = "bench", - .root_source_file = b.path("src/bench.zig"), - .target = target, - .optimize = .ReleaseFast, // ReleaseFast for benchmarks + .root_module = b.createModule(.{ + .root_source_file = b.path("src/bench.zig"), + .target = target, + .optimize = .ReleaseFast, + }), }); - const run_bench = b.addRunArtifact(bench); // Create a step for running the benchmarks From e419645532fd04622f98122de2468c394ed44f82 Mon Sep 17 00:00:00 2001 From: Gaiden-Spence Date: Sat, 18 Oct 2025 23:49:14 -0400 Subject: [PATCH 3/7] updated zon file from 0.13 to 0.15.2 and fingerprint --- build.zig.zon | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/build.zig.zon b/build.zig.zon index fdc53d5..d1b2370 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -1,8 +1,9 @@ .{ - .name = "katana", + .name = .katana, + .fingerprint = 0x1b2081fce0df035f, .version = "0.1.0", - .minimum_zig_version = "0.15.0", + .minimum_zig_version = "0.15.2", .paths = .{ // Include all source files From d3bcf2d6c130dd06219f852e2b02be6d38ca1411 Mon Sep 17 00:00:00 2001 From: Gaiden-Spence Date: Sat, 18 Oct 2025 23:52:30 -0400 Subject: [PATCH 4/7] added standardized explicit buffer --- src/bench.zig | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/bench.zig b/src/bench.zig index 1242f12..86d2eea 100644 --- a/src/bench.zig +++ b/src/bench.zig @@ -15,7 +15,7 @@ pub fn calculateGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usiz defer b.deinit(); // Initialize with random data - var prng = std.rand.DefaultPrng.init(0); + var prng = std.Random.DefaultPrng.init(0); var random = prng.random(); for (a.data) |*val| val.* = random.float(f32); for (b.data) |*val| val.* = random.float(f32); @@ -53,7 +53,14 @@ pub fn main() !void { defer arena.deinit(); const allocator = arena.allocator(); - // Define test sizes + // 1. Define an explicit buffer for the stdout writer + var stdout_buffer: [1024]u8 = undefined; + + // 2. Get the file handle and create the buffered writer + var stdout_writer_wrapper = std.fs.File.stdout().writer(&stdout_buffer); + const stdout = &stdout_writer_wrapper.interface; + + //Define test sizes const sizes = [_]struct { m: usize, n: usize, k: usize }{ .{ .m = 256, .n = 256, .k = 256 }, .{ .m = 512, .n = 512, .k = 512 }, @@ -68,12 +75,16 @@ pub fn main() !void { const iterations = 5; - try std.io.getStdOut().writer().print("\nRunning MatMul Benchmark\n", .{}); - try std.io.getStdOut().writer().print("T = {d} \n", .{T}); - try std.io.getStdOut().writer().print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()}); + // 3.
Use the new 'stdout' Writer interface pointer for printing + try stdout.print("\nRunning MatMul Benchmark\n", .{}); + try stdout.print("T = {d} \n", .{T}); + try stdout.print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()}); for (sizes) |size| { const avg_gflops = try calculateGflops(allocator, size.m, size.n, size.k, iterations); - try std.io.getStdOut().writer().print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops }); + try stdout.print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops }); } + + // 4. IMPORTANT: Flush the buffer to ensure all output is written to the terminal + try stdout.flush(); } From 61d1d13693e78f61ef21360e361346c6c2e02c92 Mon Sep 17 00:00:00 2001 From: Gaiden-Spence Date: Sat, 18 Oct 2025 23:55:25 -0400 Subject: [PATCH 5/7] made actual variable of memory alignment to be 32 bytes --- src/ops.zig | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/ops.zig b/src/ops.zig index 4b47968..8ddcc15 100644 --- a/src/ops.zig +++ b/src/ops.zig @@ -87,7 +87,8 @@ pub fn transpose(comptime T: type, tensor: *Tensor(T)) !void { const rows = tensor.shape[0]; const cols = tensor.shape[1]; - var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), 32, rows * cols); + const alignment: std.mem.Alignment = .@"32"; + var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), alignment, rows * cols); for (0..rows) |i| { for (0..cols) |j| { @@ -161,7 +162,8 @@ pub fn transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us } // Allocate memory for transposed data - var new_data = try tensor.allocator.alignedAlloc(T, 32, tensor.data.len); + const alignment: std.mem.Alignment = .@"32"; + var new_data = try tensor.allocator.alignedAlloc(T, alignment, tensor.data.len); errdefer tensor.allocator.free(new_data); // Calculate new strides @@ -177,7 +179,7 @@ pub fn
transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us // Create coordinate arrays var coords = try tensor.allocator.alloc(usize, tensor.shape.len); defer tensor.allocator.free(coords); - @memset(coords, 0); + @memset(coords[0..], 0); // Perform the transpose operation const total_elements = tensor.data.len; @@ -334,7 +336,7 @@ pub fn getChunk(comptime T: type, tensor: Tensor(T), dim: usize, chunk_idx: usiz var result_idx: usize = 0; var coords = try tensor.allocator.alloc(usize, tensor.shape.len); defer tensor.allocator.free(coords); - @memset(coords, 0); + @memset(coords[0..], 0); while (result_idx < total_elements) : (result_idx += 1) { // Calculate source coordinates @@ -446,7 +448,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize) if (first_size > 0) { var coords = try tensor.allocator.alloc(usize, tensor.shape.len); defer tensor.allocator.free(coords); - @memset(coords, 0); + @memset(coords[0..], 0); var idx: usize = 0; while (idx < first_size) : (idx += 1) { @@ -494,7 +496,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize) if (second_size > 0) { var coords = try tensor.allocator.alloc(usize, other.shape.len); defer tensor.allocator.free(coords); - @memset(coords, 0); + @memset(coords[0..], 0); var idx: usize = 0; while (idx < second_size) : (idx += 1) { @@ -644,7 +646,7 @@ pub fn stack(comptime T: type, tensors: []const Tensor(T), dim: usize) !Tensor(T // Copy data from each input tensor var coords = try ref_tensor.allocator.alloc(usize, result.shape.len); defer ref_tensor.allocator.free(coords); - @memset(coords, 0); + @memset(coords[0..], 0); const elements_per_tensor = calculateSize(ref_shape); @@ -809,7 +811,7 @@ pub fn randomTensor(comptime T: type, allocator: std.mem.Allocator, shape: []con var tensor = try Tensor(T).init(allocator, shape); errdefer tensor.deinit(); - var rng = std.rand.DefaultPrng.init(seed); + var rng = std.Random.DefaultPrng.init(seed); for 
(tensor.data) |*val| { val.* = rng.random().float(T) * 2.0 - 1.0; // Values between -1 and 1 } @@ -850,14 +852,14 @@ pub fn zeros(comptime T: type, allocator: Allocator, shape: []const usize) !Tens } // Allocate aligned data array - const alignment = 32; + const alignment: std.mem.Alignment = .@"32"; const data = try allocator.alignedAlloc(T, alignment, total_size); // Initialize all elements to zero - @memset(data, 0); + @memset(data[0..], 0); // Create tensor shape const tensor_shape = try allocator.alloc(usize, shape.len); - @memcpy(tensor_shape, shape); + @memcpy(tensor_shape[0..], shape); // Return initialized tensor return Tensor(T){ @@ -954,7 +956,7 @@ pub fn getStabilityInfo(comptime T: type, tensor: Tensor(T)) !Tensor(T).Stabilit var info = Tensor(@TypeOf(tensor.data[0])).StabilityInfo{}; switch (@typeInfo(@TypeOf(tensor.data[0]))) { - .Float => { + .float => { for (tensor.data, 0..) |value, i| { if (std.math.isNan(value)) { info.has_nan = true; @@ -1054,7 +1056,7 @@ pub fn hasInf(comptime T: type, tensor: Tensor(T)) !bool { /// ``` pub fn replaceUnstable(comptime T: type, tensor: *Tensor(T), replacement: T) !void { switch (@typeInfo(@TypeOf(tensor.data[0]))) { - .Float => { + .float => { for (tensor.data) |*value| { if (std.math.isNan(value.*) or std.math.isInf(value.*)) { value.* = replacement; @@ -1377,7 +1379,7 @@ pub fn broadcast_multiply(comptime T: type, a: *Tensor(T), b: Tensor(T)) !void { } // Copy result back to a - @memcpy(a.data, result.data); + @memcpy(a.data[0..], result.data); } /// Helper function for broadcasting subtraction. 
@@ -1479,7 +1481,7 @@ pub fn matmul(comptime T: type, a: Tensor(T), b: Tensor(T), allocator: Allocator errdefer result.deinit(); // Initialize result to zero - @memset(result.data, 0); + @memset(result.data[0..], 0); // Simple triple-loop matrix multiplication for (0..M) |i| { @@ -1505,7 +1507,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten errdefer result.deinit(); // Initialize result to zero - @memset(result.data, 0); + @memset(result.data[0..], 0); // Calculate tile grid dimensions const tiles_M = (M + Tile - 1) / Tile; @@ -1520,7 +1522,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten // Create thread pool var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads); - defer thread_pool.deinit(); + defer thread_pool.deinit(allocator); // Create thread context const context = ThreadContext{ @@ -1544,7 +1546,8 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten }; for (0..num_threads) |_| { - try thread_pool.append(try std.Thread.spawn(.{}, WorkerFn.worker, .{context})); + const thread = try std.Thread.spawn(.{}, WorkerFn.worker, .{context}); + try thread_pool.append(allocator, thread); } // Wait for all threads to complete @@ -2009,7 +2012,7 @@ fn softmax(comptime T: type, tensor: *Tensor(T), dim: usize) !void { /// Note: /// - If the array is empty, the behavior of this function is undefined. 
pub fn gelu(comptime T: type, tensor: *Tensor(T)) !void { - if (@typeInfo(T) != .Float) { + if (@typeInfo(T) != .float) { @compileError("GELU operation requires floating-point tensor"); } From 144ab6e0b59df44aac6500579336a2060b5ac199 Mon Sep 17 00:00:00 2001 From: Gaiden-Spence Date: Sat, 18 Oct 2025 23:56:18 -0400 Subject: [PATCH 6/7] added actual lengths to memcpy and memset --- src/tensor.zig | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/tensor.zig b/src/tensor.zig index 74d14c5..dd6e7eb 100644 --- a/src/tensor.zig +++ b/src/tensor.zig @@ -44,15 +44,16 @@ pub fn Tensor(comptime DataType: type) type { } } const shape_copy = try allocator.alloc(usize, shape.len); - @memcpy(shape_copy, shape); + @memcpy(shape_copy[0..], shape); // Now we know size fits in usize const final_size: usize = @intCast(size); - const data = try allocator.alignedAlloc(DataType, 32, final_size); + const alignment: std.mem.Alignment = .@"32"; + const data = try allocator.alignedAlloc(DataType, alignment, final_size); if (DataType == bool) { - @memset(data, false); + @memset(data[0..], false); } else { - @memset(data, 0); + @memset(data[0..], 0); } const self = Tensor(DataType){ @@ -163,7 +164,7 @@ pub fn Tensor(comptime DataType: type) type { // Update shape const new_shape_copy = try self.allocator.alloc(usize, new_shape.len); - @memcpy(new_shape_copy, new_shape); + @memcpy(new_shape_copy[0..], new_shape); self.allocator.free(self.shape); self.shape = new_shape_copy; @@ -338,14 +339,14 @@ pub fn Tensor(comptime DataType: type) type { // Create coordinate arrays var src_coords = try self.allocator.alloc(usize, self.shape.len); defer self.allocator.free(src_coords); - @memset(src_coords, 0); + @memset(src_coords[0..], 0); // Set the fixed dimension to the specified index src_coords[dim] = index; var dst_coords = try self.allocator.alloc(usize, result.shape.len); defer self.allocator.free(dst_coords); - @memset(dst_coords, 0); + 
@memset(dst_coords[0..], 0); // Copy data const total_elements = calculateSize(result.shape); @@ -444,7 +445,7 @@ pub fn Tensor(comptime DataType: type) type { // Copy data with proper indexing var coords = try self.allocator.alloc(usize, self.shape.len); defer self.allocator.free(coords); - @memset(coords, 0); + @memset(coords[0..], 0); var result_idx: usize = 0; while (true) { @@ -568,7 +569,7 @@ pub fn Tensor(comptime DataType: type) type { /// - An error if the tensor cannot be copied for some reason. pub fn copy(self: Self) !Self { const new_tensor = try Self.init(self.allocator, self.shape); - @memcpy(new_tensor.data, self.data); + @memcpy(new_tensor.data[0..], self.data); return new_tensor; } From ccd0c69dc2d9593f90ae526f1b048b031b014b4a Mon Sep 17 00:00:00 2001 From: Gaiden-Spence Date: Sat, 18 Oct 2025 23:57:51 -0400 Subject: [PATCH 7/7] added actual lengths to the memset and changed enum value to .float --- src/tests.zig | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/tests.zig b/src/tests.zig index 57e0c3a..74ec05c 100644 --- a/src/tests.zig +++ b/src/tests.zig @@ -413,7 +413,7 @@ test "complex transpose operations" { 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, }; - @memcpy(tensor.data, &pattern); + @memcpy(tensor.data[0..], &pattern); try ops.transpose(f32, &tensor); @@ -460,7 +460,7 @@ test "complex transpose operations" { 4, 5, 6, 7, 8, 9, }; - @memcpy(tensor.data, &pattern); + @memcpy(tensor.data[0..], &pattern); try ops.transpose(f32, &tensor); @@ -479,7 +479,7 @@ test "complex transpose operations" { defer row_tensor.deinit(); const row_data = [_]f32{ 1, 2, 3, 4, 5 }; - @memcpy(row_tensor.data, &row_data); + @memcpy(row_tensor.data[0..], &row_data); try ops.transpose(f32, &row_tensor); try expectEqual(@as(usize, 5), row_tensor.shape[0]); @@ -490,7 +490,7 @@ test "complex transpose operations" { defer col_tensor.deinit(); const col_data = [_]f32{ 1, 2, 3, 4, 5 }; - 
@memcpy(col_tensor.data, &col_data); + @memcpy(col_tensor.data[0..], &col_data); try ops.transpose(f32, &col_tensor); try expectEqual(@as(usize, 1), col_tensor.shape[0]); @@ -1508,11 +1508,11 @@ test "layerNorm basic functionality" { // Create weight and bias tensors var weight = try Tensor(f32).init(allocator, &[_]usize{3}); defer weight.deinit(); - @memset(weight.data, 1.0); // Scale factor of 1 + @memset(weight.data[0..], 1.0); // Scale factor of 1 var bias = try Tensor(f32).init(allocator, &[_]usize{3}); defer bias.deinit(); - @memset(bias.data, 0.0); // No bias + @memset(bias.data[0..], 0.0); // No bias // Apply layer normalization var result = try ops.layerNorm(f32, input, weight, bias, 1e-5); @@ -1552,11 +1552,11 @@ test "layerNorm stability checks" { var weight = try Tensor(f32).init(allocator, &[_]usize{3}); defer weight.deinit(); - @memset(weight.data, 1.0); + @memset(weight.data[0..], 1.0); var bias = try Tensor(f32).init(allocator, &[_]usize{3}); defer bias.deinit(); - @memset(bias.data, 0.0); + @memset(bias.data[0..], 0.0); try testing.expectError(error.HasNaN, ops.layerNorm(f32, input, weight, bias, 1e-5)); } @@ -1565,15 +1565,15 @@ test "layerNorm stability checks" { { var input = try Tensor(f32).init(allocator, &[_]usize{ 2, 3 }); defer input.deinit(); - @memset(input.data, 1.0); // All same values -> zero variance + @memset(input.data[0..], 1.0); // All same values -> zero variance var weight = try Tensor(f32).init(allocator, &[_]usize{3}); defer weight.deinit(); - @memset(weight.data, 1.0); + @memset(weight.data[0..], 1.0); var bias = try Tensor(f32).init(allocator, &[_]usize{3}); defer bias.deinit(); - @memset(bias.data, 0.0); + @memset(bias.data[0..], 0.0); var result = try ops.layerNorm(f32, input, weight, bias, 1e-5); defer result.deinit(); @@ -1586,15 +1586,15 @@ test "layerNorm stability checks" { { var input = try Tensor(f32).init(allocator, &[_]usize{ 2, 3 }); defer input.deinit(); - @memset(input.data, 1.0); + @memset(input.data[0..], 
1.0); var weight = try Tensor(f32).init(allocator, &[_]usize{3}); defer weight.deinit(); - @memset(weight.data, 1.0); + @memset(weight.data[0..], 1.0); var bias = try Tensor(f32).init(allocator, &[_]usize{3}); defer bias.deinit(); - @memset(bias.data, 0.0); + @memset(bias.data[0..], 0.0); try testing.expectError(error.InvalidEpsilon, ops.layerNorm(f32, input, weight, bias, -1e-5)); } @@ -2425,7 +2425,7 @@ test "argmax with empty tensor" { // Helper function to create and fill a tensor with test data fn createTestTensor(comptime T: type, allocator: std.mem.Allocator, shape: []const usize, data: []const T) !Tensor(T) { const tensor = try Tensor(T).init(allocator, shape); - @memcpy(tensor.data, data); + @memcpy(tensor.data[0..], data); return tensor; } @@ -2438,7 +2438,7 @@ pub fn compareTensors(comptime T: type, expected: Tensor(T), actual: Tensor(T)) for (expected.data, actual.data, 0..) |exp, act, i| { switch (@typeInfo(T)) { - .Float => { + .float => { // Handle special values if (std.math.isNan(exp)) { if (!std.math.isNan(act)) { @@ -2920,8 +2920,8 @@ test "matmul numerical stability" { // Using smaller values to avoid overflow const large: f32 = 1e3; - @memset(a.data, large); - @memset(b.data, large); + @memset(a.data[0..], large); + @memset(b.data[0..], large); var result = try ops.matmul(f32, a, b, allocator); defer result.deinit(); @@ -2944,8 +2944,8 @@ test "matmul numerical stability" { defer b.deinit(); const small: f32 = 1e-3; - @memset(a.data, small); - @memset(b.data, small); + @memset(a.data[0..], small); + @memset(b.data[0..], small); var result = try ops.matmul(f32, a, b, allocator); defer result.deinit();