From 34f8ae05c75015adaffd85b3a127528fd4f0bf9b Mon Sep 17 00:00:00 2001
From: Gaiden-Spence <gaiden.anime@protonmail.com>
Date: Tue, 14 Oct 2025 22:25:09 -0400
Subject: [PATCH 1/7] updated zon file to 0.15

---
 build.zig.zon | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.zig.zon b/build.zig.zon
index 7b6432d..fdc53d5 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -2,7 +2,7 @@
     .name = "katana",
     .version = "0.1.0",
 
-    .minimum_zig_version = "0.13.0",
+    .minimum_zig_version = "0.15.0",
 
     .paths = .{
         // Include all source files

From cbe375a4ad2fd93cae9bcb9ee0592d3d44f25622 Mon Sep 17 00:00:00 2001
From: Gaiden-Spence <gaiden.anime@protonmail.com>
Date: Sat, 18 Oct 2025 23:11:43 -0400
Subject: [PATCH 2/7] updated build file for 0.15

---
 build.zig | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/build.zig b/build.zig
index 1a9423b..75d4195 100644
--- a/build.zig
+++ b/build.zig
@@ -24,9 +24,11 @@ pub fn build(b: *std.Build) void {
 
     // Tests
     const tests = b.addTest(.{
-        .root_source_file = b.path("src/tests.zig"),
-        .target = target,
-        .optimize = .ReleaseSafe, // ReleaseSafe for tests
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/tests.zig"),
+            .target = target,
+            .optimize = .ReleaseSafe,
+        }),
     });
 
     const run_tests = b.addRunArtifact(tests);
@@ -42,11 +44,12 @@ pub fn build(b: *std.Build) void {
     // Benchmarks
     const bench = b.addExecutable(.{
         .name = "bench",
-        .root_source_file = b.path("src/bench.zig"),
-        .target = target,
-        .optimize = .ReleaseFast, // ReleaseFast for benchmarks
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/bench.zig"),
+            .target = target,
+            .optimize = .ReleaseFast,
+        }),
     });
-
     const run_bench = b.addRunArtifact(bench);
 
     // Create a step for running the benchmarks

From e419645532fd04622f98122de2468c394ed44f82 Mon Sep 17 00:00:00 2001
From: Gaiden-Spence <gaiden.anime@protonmail.com>
Date: Sat, 18 Oct 2025 23:49:14 -0400
Subject: [PATCH 3/7] updated zon file from 0.13 to 0.15.2 and fingerprint

---
 build.zig.zon | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/build.zig.zon b/build.zig.zon
index fdc53d5..d1b2370 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -1,8 +1,9 @@
 .{
-    .name = "katana",
+    .name = .katana,
+    .fingerprint = 0x1b2081fce0df035f,
     .version = "0.1.0",
 
-    .minimum_zig_version = "0.15.0",
+    .minimum_zig_version = "0.15.2",
 
     .paths = .{
         // Include all source files

From d3bcf2d6c130dd06219f852e2b02be6d38ca1411 Mon Sep 17 00:00:00 2001
From: Gaiden-Spence <gaiden.anime@protonmail.com>
Date: Sat, 18 Oct 2025 23:52:30 -0400
Subject: [PATCH 4/7] added standardized explicit buffer

---
 src/bench.zig | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/bench.zig b/src/bench.zig
index 1242f12..86d2eea 100644
--- a/src/bench.zig
+++ b/src/bench.zig
@@ -15,7 +15,7 @@ pub fn calculateGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usiz
     defer b.deinit();
 
     // Initialize with random data
-    var prng = std.rand.DefaultPrng.init(0);
+    var prng = std.Random.DefaultPrng.init(0);
     var random = prng.random();
     for (a.data) |*val| val.* = random.float(f32);
     for (b.data) |*val| val.* = random.float(f32);
@@ -53,7 +53,14 @@ pub fn main() !void {
     defer arena.deinit();
     const allocator = arena.allocator();
 
-    // Define test sizes
+    // 1. Define an explicit buffer for the stdout writer
+    var stdout_buffer: [1024]u8 = undefined;
+
+    // 2. Get the file handle and create the buffered writer
+    var stdout_writer_wrapper = std.fs.File.stdout().writer(&stdout_buffer);
+    const stdout = &stdout_writer_wrapper.interface;
+
+    // Define test sizes
     const sizes = [_]struct { m: usize, n: usize, k: usize }{
         .{ .m = 256, .n = 256, .k = 256 },
         .{ .m = 512, .n = 512, .k = 512 },
@@ -68,12 +75,16 @@ pub fn main() !void {
 
     const iterations = 5;
 
-    try std.io.getStdOut().writer().print("\nRunning MatMul Benchmark\n", .{});
-    try std.io.getStdOut().writer().print("T = {d} \n", .{T});
-    try std.io.getStdOut().writer().print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()});
+    // 3. Use the new 'stdout' Writer interface pointer for printing
+    try stdout.print("\nRunning MatMul Benchmark\n", .{});
+    try stdout.print("T = {d} \n", .{T});
+    try stdout.print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()});
 
     for (sizes) |size| {
         const avg_gflops = try calculateGflops(allocator, size.m, size.n, size.k, iterations);
-        try std.io.getStdOut().writer().print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops });
+        try stdout.print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops });
     }
+
+    // 4. IMPORTANT: Flush the buffer to ensure all output is written to the terminal
+    try stdout.flush();
 }

From 61d1d13693e78f61ef21360e361346c6c2e02c92 Mon Sep 17 00:00:00 2001
From: Gaiden-Spence <gaiden.anime@protonmail.com>
Date: Sat, 18 Oct 2025 23:55:25 -0400
Subject: [PATCH 5/7] made memory alignment an explicit std.mem.Alignment of 32 bytes

---
 src/ops.zig | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/ops.zig b/src/ops.zig
index 4b47968..8ddcc15 100644
--- a/src/ops.zig
+++ b/src/ops.zig
@@ -87,7 +87,8 @@ pub fn transpose(comptime T: type, tensor: *Tensor(T)) !void {
 
     const rows = tensor.shape[0];
     const cols = tensor.shape[1];
-    var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), 32, rows * cols);
+    const alignment: std.mem.Alignment = .@"32";
+    var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), alignment, rows * cols);
 
     for (0..rows) |i| {
         for (0..cols) |j| {
@@ -161,7 +162,8 @@ pub fn transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us
     }
 
     // Allocate memory for transposed data
-    var new_data = try tensor.allocator.alignedAlloc(T, 32, tensor.data.len);
+    const alignment: std.mem.Alignment = .@"32";
+    var new_data = try tensor.allocator.alignedAlloc(T, alignment, tensor.data.len);
     errdefer tensor.allocator.free(new_data);
 
     // Calculate new strides
@@ -177,7 +179,7 @@ pub fn transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us
     // Create coordinate arrays
     var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
     defer tensor.allocator.free(coords);
-    @memset(coords, 0);
+    @memset(coords[0..], 0);
 
     // Perform the transpose operation
     const total_elements = tensor.data.len;
@@ -334,7 +336,7 @@ pub fn getChunk(comptime T: type, tensor: Tensor(T), dim: usize, chunk_idx: usiz
     var result_idx: usize = 0;
     var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
     defer tensor.allocator.free(coords);
-    @memset(coords, 0);
+    @memset(coords[0..], 0);
 
     while (result_idx < total_elements) : (result_idx += 1) {
         // Calculate source coordinates
@@ -446,7 +448,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize)
     if (first_size > 0) {
         var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
         defer tensor.allocator.free(coords);
-        @memset(coords, 0);
+        @memset(coords[0..], 0);
 
         var idx: usize = 0;
         while (idx < first_size) : (idx += 1) {
@@ -494,7 +496,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize)
     if (second_size > 0) {
         var coords = try tensor.allocator.alloc(usize, other.shape.len);
         defer tensor.allocator.free(coords);
-        @memset(coords, 0);
+        @memset(coords[0..], 0);
 
         var idx: usize = 0;
         while (idx < second_size) : (idx += 1) {
@@ -644,7 +646,7 @@ pub fn stack(comptime T: type, tensors: []const Tensor(T), dim: usize) !Tensor(T
     // Copy data from each input tensor
     var coords = try ref_tensor.allocator.alloc(usize, result.shape.len);
     defer ref_tensor.allocator.free(coords);
-    @memset(coords, 0);
+    @memset(coords[0..], 0);
 
     const elements_per_tensor = calculateSize(ref_shape);
 
@@ -809,7 +811,7 @@ pub fn randomTensor(comptime T: type, allocator: std.mem.Allocator, shape: []con
     var tensor = try Tensor(T).init(allocator, shape);
     errdefer tensor.deinit();
 
-    var rng = std.rand.DefaultPrng.init(seed);
+    var rng = std.Random.DefaultPrng.init(seed);
     for (tensor.data) |*val| {
         val.* = rng.random().float(T) * 2.0 - 1.0; // Values between -1 and 1
     }
@@ -850,14 +852,14 @@ pub fn zeros(comptime T: type, allocator: Allocator, shape: []const usize) !Tens
     }
 
     // Allocate aligned data array
-    const alignment = 32;
+    const alignment: std.mem.Alignment = .@"32";
     const data = try allocator.alignedAlloc(T, alignment, total_size);
     // Initialize all elements to zero
-    @memset(data, 0);
+    @memset(data[0..], 0);
 
     // Create tensor shape
     const tensor_shape = try allocator.alloc(usize, shape.len);
-    @memcpy(tensor_shape, shape);
+    @memcpy(tensor_shape[0..], shape);
 
     // Return initialized tensor
     return Tensor(T){
@@ -954,7 +956,7 @@ pub fn getStabilityInfo(comptime T: type, tensor: Tensor(T)) !Tensor(T).Stabilit
     var info = Tensor(@TypeOf(tensor.data[0])).StabilityInfo{};
 
     switch (@typeInfo(@TypeOf(tensor.data[0]))) {
-        .Float => {
+        .float => {
             for (tensor.data, 0..) |value, i| {
                 if (std.math.isNan(value)) {
                     info.has_nan = true;
@@ -1054,7 +1056,7 @@ pub fn hasInf(comptime T: type, tensor: Tensor(T)) !bool {
 /// ```
 pub fn replaceUnstable(comptime T: type, tensor: *Tensor(T), replacement: T) !void {
     switch (@typeInfo(@TypeOf(tensor.data[0]))) {
-        .Float => {
+        .float => {
             for (tensor.data) |*value| {
                 if (std.math.isNan(value.*) or std.math.isInf(value.*)) {
                     value.* = replacement;
@@ -1377,7 +1379,7 @@ pub fn broadcast_multiply(comptime T: type, a: *Tensor(T), b: Tensor(T)) !void {
     }
 
     // Copy result back to a
-    @memcpy(a.data, result.data);
+    @memcpy(a.data[0..], result.data);
 }
 
 /// Helper function for broadcasting subtraction.
@@ -1479,7 +1481,7 @@ pub fn matmul(comptime T: type, a: Tensor(T), b: Tensor(T), allocator: Allocator
     errdefer result.deinit();
 
     // Initialize result to zero
-    @memset(result.data, 0);
+    @memset(result.data[0..], 0);
 
     // Simple triple-loop matrix multiplication
     for (0..M) |i| {
@@ -1505,7 +1507,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
     errdefer result.deinit();
 
     // Initialize result to zero
-    @memset(result.data, 0);
+    @memset(result.data[0..], 0);
 
     // Calculate tile grid dimensions
     const tiles_M = (M + Tile - 1) / Tile;
@@ -1520,7 +1522,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
 
     // Create thread pool
     var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
-    defer thread_pool.deinit();
+    defer thread_pool.deinit(allocator);
 
     // Create thread context
     const context = ThreadContext{
@@ -1544,7 +1546,8 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
     };
 
     for (0..num_threads) |_| {
-        try thread_pool.append(try std.Thread.spawn(.{}, WorkerFn.worker, .{context}));
+        const thread = try std.Thread.spawn(.{}, WorkerFn.worker, .{context});
+        try thread_pool.append(allocator, thread);
     }
 
     // Wait for all threads to complete
@@ -2009,7 +2012,7 @@ fn softmax(comptime T: type, tensor: *Tensor(T), dim: usize) !void {
 /// Note:
 /// - If the array is empty, the behavior of this function is undefined.
 pub fn gelu(comptime T: type, tensor: *Tensor(T)) !void {
-    if (@typeInfo(T) != .Float) {
+    if (@typeInfo(T) != .float) {
         @compileError("GELU operation requires floating-point tensor");
     }
 

From 144ab6e0b59df44aac6500579336a2060b5ac199 Mon Sep 17 00:00:00 2001
From: Gaiden-Spence <gaiden.anime@protonmail.com>
Date: Sat, 18 Oct 2025 23:56:18 -0400
Subject: [PATCH 6/7] added explicit [0..] slices to memcpy and memset

---
 src/tensor.zig | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/tensor.zig b/src/tensor.zig
index 74d14c5..dd6e7eb 100644
--- a/src/tensor.zig
+++ b/src/tensor.zig
@@ -44,15 +44,16 @@ pub fn Tensor(comptime DataType: type) type {
                 }
             }
             const shape_copy = try allocator.alloc(usize, shape.len);
-            @memcpy(shape_copy, shape);
+            @memcpy(shape_copy[0..], shape);
 
             // Now we know size fits in usize
             const final_size: usize = @intCast(size);
-            const data = try allocator.alignedAlloc(DataType, 32, final_size);
+            const alignment: std.mem.Alignment = .@"32";
+            const data = try allocator.alignedAlloc(DataType, alignment, final_size);
             if (DataType == bool) {
-                @memset(data, false);
+                @memset(data[0..], false);
             } else {
-                @memset(data, 0);
+                @memset(data[0..], 0);
             }
 
             const self = Tensor(DataType){
@@ -163,7 +164,7 @@ pub fn Tensor(comptime DataType: type) type {
 
             // Update shape
             const new_shape_copy = try self.allocator.alloc(usize, new_shape.len);
-            @memcpy(new_shape_copy, new_shape);
+            @memcpy(new_shape_copy[0..], new_shape);
 
             self.allocator.free(self.shape);
             self.shape = new_shape_copy;
@@ -338,14 +339,14 @@ pub fn Tensor(comptime DataType: type) type {
                 // Create coordinate arrays
                 var src_coords = try self.allocator.alloc(usize, self.shape.len);
                 defer self.allocator.free(src_coords);
-                @memset(src_coords, 0);
+                @memset(src_coords[0..], 0);
 
                 // Set the fixed dimension to the specified index
                 src_coords[dim] = index;
 
                 var dst_coords = try self.allocator.alloc(usize, result.shape.len);
                 defer self.allocator.free(dst_coords);
-                @memset(dst_coords, 0);
+                @memset(dst_coords[0..], 0);
 
                 // Copy data
                 const total_elements = calculateSize(result.shape);
@@ -444,7 +445,7 @@ pub fn Tensor(comptime DataType: type) type {
             // Copy data with proper indexing
             var coords = try self.allocator.alloc(usize, self.shape.len);
             defer self.allocator.free(coords);
-            @memset(coords, 0);
+            @memset(coords[0..], 0);
 
             var result_idx: usize = 0;
             while (true) {
@@ -568,7 +569,7 @@ pub fn Tensor(comptime DataType: type) type {
         /// - An error if the tensor cannot be copied for some reason.
         pub fn copy(self: Self) !Self {
             const new_tensor = try Self.init(self.allocator, self.shape);
-            @memcpy(new_tensor.data, self.data);
+            @memcpy(new_tensor.data[0..], self.data);
             return new_tensor;
         }
 

From ccd0c69dc2d9593f90ae526f1b048b031b014b4a Mon Sep 17 00:00:00 2001
From: Gaiden-Spence <gaiden.anime@protonmail.com>
Date: Sat, 18 Oct 2025 23:57:51 -0400
Subject: [PATCH 7/7] added explicit [0..] slices to memcpy and memset and changed
 enum value to .float

---
 src/tests.zig | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/tests.zig b/src/tests.zig
index 57e0c3a..74ec05c 100644
--- a/src/tests.zig
+++ b/src/tests.zig
@@ -413,7 +413,7 @@ test "complex transpose operations" {
             5.0, -6.0,  7.0,  -8.0,
             9.0, -10.0, 11.0, -12.0,
         };
-        @memcpy(tensor.data, &pattern);
+        @memcpy(tensor.data[0..], &pattern);
 
         try ops.transpose(f32, &tensor);
 
@@ -460,7 +460,7 @@ test "complex transpose operations" {
             4, 5, 6,
             7, 8, 9,
         };
-        @memcpy(tensor.data, &pattern);
+        @memcpy(tensor.data[0..], &pattern);
 
         try ops.transpose(f32, &tensor);
 
@@ -479,7 +479,7 @@ test "complex transpose operations" {
         defer row_tensor.deinit();
 
         const row_data = [_]f32{ 1, 2, 3, 4, 5 };
-        @memcpy(row_tensor.data, &row_data);
+        @memcpy(row_tensor.data[0..], &row_data);
 
         try ops.transpose(f32, &row_tensor);
         try expectEqual(@as(usize, 5), row_tensor.shape[0]);
@@ -490,7 +490,7 @@ test "complex transpose operations" {
         defer col_tensor.deinit();
 
         const col_data = [_]f32{ 1, 2, 3, 4, 5 };
-        @memcpy(col_tensor.data, &col_data);
+        @memcpy(col_tensor.data[0..], &col_data);
 
         try ops.transpose(f32, &col_tensor);
         try expectEqual(@as(usize, 1), col_tensor.shape[0]);
@@ -1508,11 +1508,11 @@ test "layerNorm basic functionality" {
     // Create weight and bias tensors
     var weight = try Tensor(f32).init(allocator, &[_]usize{3});
     defer weight.deinit();
-    @memset(weight.data, 1.0); // Scale factor of 1
+    @memset(weight.data[0..], 1.0); // Scale factor of 1
 
     var bias = try Tensor(f32).init(allocator, &[_]usize{3});
     defer bias.deinit();
-    @memset(bias.data, 0.0); // No bias
+    @memset(bias.data[0..], 0.0); // No bias
 
     // Apply layer normalization
     var result = try ops.layerNorm(f32, input, weight, bias, 1e-5);
@@ -1552,11 +1552,11 @@ test "layerNorm stability checks" {
 
         var weight = try Tensor(f32).init(allocator, &[_]usize{3});
         defer weight.deinit();
-        @memset(weight.data, 1.0);
+        @memset(weight.data[0..], 1.0);
 
         var bias = try Tensor(f32).init(allocator, &[_]usize{3});
         defer bias.deinit();
-        @memset(bias.data, 0.0);
+        @memset(bias.data[0..], 0.0);
 
         try testing.expectError(error.HasNaN, ops.layerNorm(f32, input, weight, bias, 1e-5));
     }
@@ -1565,15 +1565,15 @@ test "layerNorm stability checks" {
     {
         var input = try Tensor(f32).init(allocator, &[_]usize{ 2, 3 });
         defer input.deinit();
-        @memset(input.data, 1.0); // All same values -> zero variance
+        @memset(input.data[0..], 1.0); // All same values -> zero variance
 
         var weight = try Tensor(f32).init(allocator, &[_]usize{3});
         defer weight.deinit();
-        @memset(weight.data, 1.0);
+        @memset(weight.data[0..], 1.0);
 
         var bias = try Tensor(f32).init(allocator, &[_]usize{3});
         defer bias.deinit();
-        @memset(bias.data, 0.0);
+        @memset(bias.data[0..], 0.0);
 
         var result = try ops.layerNorm(f32, input, weight, bias, 1e-5);
         defer result.deinit();
@@ -1586,15 +1586,15 @@ test "layerNorm stability checks" {
     {
         var input = try Tensor(f32).init(allocator, &[_]usize{ 2, 3 });
         defer input.deinit();
-        @memset(input.data, 1.0);
+        @memset(input.data[0..], 1.0);
 
         var weight = try Tensor(f32).init(allocator, &[_]usize{3});
         defer weight.deinit();
-        @memset(weight.data, 1.0);
+        @memset(weight.data[0..], 1.0);
 
         var bias = try Tensor(f32).init(allocator, &[_]usize{3});
         defer bias.deinit();
-        @memset(bias.data, 0.0);
+        @memset(bias.data[0..], 0.0);
 
         try testing.expectError(error.InvalidEpsilon, ops.layerNorm(f32, input, weight, bias, -1e-5));
     }
@@ -2425,7 +2425,7 @@ test "argmax with empty tensor" {
 // Helper function to create and fill a tensor with test data
 fn createTestTensor(comptime T: type, allocator: std.mem.Allocator, shape: []const usize, data: []const T) !Tensor(T) {
     const tensor = try Tensor(T).init(allocator, shape);
-    @memcpy(tensor.data, data);
+    @memcpy(tensor.data[0..], data);
     return tensor;
 }
 
@@ -2438,7 +2438,7 @@ pub fn compareTensors(comptime T: type, expected: Tensor(T), actual: Tensor(T))
 
     for (expected.data, actual.data, 0..) |exp, act, i| {
         switch (@typeInfo(T)) {
-            .Float => {
+            .float => {
                 // Handle special values
                 if (std.math.isNan(exp)) {
                     if (!std.math.isNan(act)) {
@@ -2920,8 +2920,8 @@ test "matmul numerical stability" {
 
         // Using smaller values to avoid overflow
         const large: f32 = 1e3;
-        @memset(a.data, large);
-        @memset(b.data, large);
+        @memset(a.data[0..], large);
+        @memset(b.data[0..], large);
 
         var result = try ops.matmul(f32, a, b, allocator);
         defer result.deinit();
@@ -2944,8 +2944,8 @@ test "matmul numerical stability" {
         defer b.deinit();
 
         const small: f32 = 1e-3;
-        @memset(a.data, small);
-        @memset(b.data, small);
+        @memset(a.data[0..], small);
+        @memset(b.data[0..], small);
 
         var result = try ops.matmul(f32, a, b, allocator);
         defer result.deinit();