snowclipsed · Gaiden-Spence · Oct 15, 2025 · Oct 19, 2025 · Oct 19, 2025 · Oct 19, 2025
diff --git a/build.zig b/build.zig
@@ -24,9 +24,11 @@ pub fn build(b: *std.Build) void {
 
     // Tests
     const tests = b.addTest(.{
-        .root_source_file = b.path("src/tests.zig"),
-        .target = target,
-        .optimize = .ReleaseSafe, // ReleaseSafe for tests
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/tests.zig"),
+            .target = target,
+            .optimize = .ReleaseSafe,
+        }),
     });
 
     const run_tests = b.addRunArtifact(tests);
@@ -42,11 +44,12 @@ pub fn build(b: *std.Build) void {
     // Benchmarks
     const bench = b.addExecutable(.{
         .name = "bench",
-        .root_source_file = b.path("src/bench.zig"),
-        .target = target,
-        .optimize = .ReleaseFast, // ReleaseFast for benchmarks
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("src/bench.zig"),
+            .target = target,
+            .optimize = .ReleaseFast,
+        }),
     });
-
     const run_bench = b.addRunArtifact(bench);
 
     // Create a step for running the benchmarks

diff --git a/build.zig.zon b/build.zig.zon
@@ -1,8 +1,9 @@
 .{
-    .name = "katana",
+    .name = .katana,
+    .fingerprint = 0x1b2081fce0df035f,
     .version = "0.1.0",
 
-    .minimum_zig_version = "0.13.0",
+    .minimum_zig_version = "0.15.2",
 
     .paths = .{
         // Include all source files

diff --git a/src/bench.zig b/src/bench.zig
@@ -15,7 +15,7 @@ pub fn calculateGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usiz
     defer b.deinit();
 
     // Initialize with random data
-    var prng = std.rand.DefaultPrng.init(0);
+    var prng = std.Random.DefaultPrng.init(0);
     var random = prng.random();
     for (a.data) |*val| val.* = random.float(f32);
     for (b.data) |*val| val.* = random.float(f32);
@@ -53,7 +53,14 @@ pub fn main() !void {
     defer arena.deinit();
     const allocator = arena.allocator();
 
-    // Define test sizes
+    // 1. Define an explicit buffer for the stdout writer
+    var stdout_buffer: [1024]u8 = undefined;
+
+    // 2. Get the file handle and create the buffered writer
+    var stdout_writer_wrapper = std.fs.File.stdout().writer(&stdout_buffer);
+    const stdout = &stdout_writer_wrapper.interface;
+
+    // Define test sizes
     const sizes = [_]struct { m: usize, n: usize, k: usize }{
         .{ .m = 256, .n = 256, .k = 256 },
         .{ .m = 512, .n = 512, .k = 512 },
@@ -68,12 +75,16 @@ pub fn main() !void {
 
     const iterations = 5;
 
-    try std.io.getStdOut().writer().print("\nRunning MatMul Benchmark\n", .{});
-    try std.io.getStdOut().writer().print("T = {d} \n", .{T});
-    try std.io.getStdOut().writer().print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()});
+    // 3. Use the new 'stdout' Writer interface pointer for printing
+    try stdout.print("\nRunning MatMul Benchmark\n", .{});
+    try stdout.print("T = {d} \n", .{T});
+    try stdout.print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()});
 
     for (sizes) |size| {
         const avg_gflops = try calculateGflops(allocator, size.m, size.n, size.k, iterations);
-        try std.io.getStdOut().writer().print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops });
+        try stdout.print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops });
     }
+
+    // 4. IMPORTANT: Flush the buffer to ensure all output is written to the terminal
+    try stdout.flush();
 }
diff --git a/src/ops.zig b/src/ops.zig
@@ -87,7 +87,8 @@ pub fn transpose(comptime T: type, tensor: *Tensor(T)) !void {
 
     const rows = tensor.shape[0];
     const cols = tensor.shape[1];
-    var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), 32, rows * cols);
+    const alignment: std.mem.Alignment = .@"32";
+    var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), alignment, rows * cols);
 
     for (0..rows) |i| {
         for (0..cols) |j| {
@@ -161,7 +162,8 @@ pub fn transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us
     }
 
     // Allocate memory for transposed data
-    var new_data = try tensor.allocator.alignedAlloc(T, 32, tensor.data.len);
+    const alignment: std.mem.Alignment = .@"32";
+    var new_data = try tensor.allocator.alignedAlloc(T, alignment, tensor.data.len);
     errdefer tensor.allocator.free(new_data);
 
     // Calculate new strides
@@ -177,7 +179,7 @@ pub fn transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us
     // Create coordinate arrays
     var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
     defer tensor.allocator.free(coords);
-    @memset(coords, 0);
+    @memset(coords[0..], 0);
 
     // Perform the transpose operation
     const total_elements = tensor.data.len;
@@ -334,7 +336,7 @@ pub fn getChunk(comptime T: type, tensor: Tensor(T), dim: usize, chunk_idx: usiz
     var result_idx: usize = 0;
     var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
     defer tensor.allocator.free(coords);
-    @memset(coords, 0);
+    @memset(coords[0..], 0);
 
     while (result_idx < total_elements) : (result_idx += 1) {
         // Calculate source coordinates
@@ -446,7 +448,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize)
     if (first_size > 0) {
         var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
         defer tensor.allocator.free(coords);
-        @memset(coords, 0);
+        @memset(coords[0..], 0);
 
         var idx: usize = 0;
         while (idx < first_size) : (idx += 1) {
@@ -494,7 +496,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize)
     if (second_size > 0) {
         var coords = try tensor.allocator.alloc(usize, other.shape.len);
         defer tensor.allocator.free(coords);
-        @memset(coords, 0);
+        @memset(coords[0..], 0);
 
         var idx: usize = 0;
         while (idx < second_size) : (idx += 1) {
@@ -644,7 +646,7 @@ pub fn stack(comptime T: type, tensors: []const Tensor(T), dim: usize) !Tensor(T
     // Copy data from each input tensor
     var coords = try ref_tensor.allocator.alloc(usize, result.shape.len);
     defer ref_tensor.allocator.free(coords);
-    @memset(coords, 0);
+    @memset(coords[0..], 0);
 
     const elements_per_tensor = calculateSize(ref_shape);
 
@@ -809,7 +811,7 @@ pub fn randomTensor(comptime T: type, allocator: std.mem.Allocator, shape: []con
     var tensor = try Tensor(T).init(allocator, shape);
     errdefer tensor.deinit();
 
-    var rng = std.rand.DefaultPrng.init(seed);
+    var rng = std.Random.DefaultPrng.init(seed);
     for (tensor.data) |*val| {
         val.* = rng.random().float(T) * 2.0 - 1.0; // Values between -1 and 1
     }
@@ -850,14 +852,14 @@ pub fn zeros(comptime T: type, allocator: Allocator, shape: []const usize) !Tens
     }
 
     // Allocate aligned data array
-    const alignment = 32;
+    const alignment: std.mem.Alignment = .@"32";
     const data = try allocator.alignedAlloc(T, alignment, total_size);
     // Initialize all elements to zero
-    @memset(data, 0);
+    @memset(data[0..], 0);
 
     // Create tensor shape
     const tensor_shape = try allocator.alloc(usize, shape.len);
-    @memcpy(tensor_shape, shape);
+    @memcpy(tensor_shape[0..], shape);
 
     // Return initialized tensor
     return Tensor(T){
@@ -954,7 +956,7 @@ pub fn getStabilityInfo(comptime T: type, tensor: Tensor(T)) !Tensor(T).Stabilit
     var info = Tensor(@TypeOf(tensor.data[0])).StabilityInfo{};
 
     switch (@typeInfo(@TypeOf(tensor.data[0]))) {
-        .Float => {
+        .float => {
             for (tensor.data, 0..) |value, i| {
                 if (std.math.isNan(value)) {
                     info.has_nan = true;
@@ -1054,7 +1056,7 @@ pub fn hasInf(comptime T: type, tensor: Tensor(T)) !bool {
 /// ```
 pub fn replaceUnstable(comptime T: type, tensor: *Tensor(T), replacement: T) !void {
     switch (@typeInfo(@TypeOf(tensor.data[0]))) {
-        .Float => {
+        .float => {
             for (tensor.data) |*value| {
                 if (std.math.isNan(value.*) or std.math.isInf(value.*)) {
                     value.* = replacement;
@@ -1377,7 +1379,7 @@ pub fn broadcast_multiply(comptime T: type, a: *Tensor(T), b: Tensor(T)) !void {
     }
 
     // Copy result back to a
-    @memcpy(a.data, result.data);
+    @memcpy(a.data[0..], result.data);
 }
 
 /// Helper function for broadcasting subtraction.
@@ -1479,7 +1481,7 @@ pub fn matmul(comptime T: type, a: Tensor(T), b: Tensor(T), allocator: Allocator
     errdefer result.deinit();
 
     // Initialize result to zero
-    @memset(result.data, 0);
+    @memset(result.data[0..], 0);
 
     // Simple triple-loop matrix multiplication
     for (0..M) |i| {
@@ -1505,7 +1507,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
     errdefer result.deinit();
 
     // Initialize result to zero
-    @memset(result.data, 0);
+    @memset(result.data[0..], 0);
 
     // Calculate tile grid dimensions
     const tiles_M = (M + Tile - 1) / Tile;
@@ -1520,7 +1522,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
 
     // Create thread pool
     var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
-    defer thread_pool.deinit();
+    defer thread_pool.deinit(allocator);
 
     // Create thread context
     const context = ThreadContext{
@@ -1544,7 +1546,8 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
     };
 
     for (0..num_threads) |_| {
-        try thread_pool.append(try std.Thread.spawn(.{}, WorkerFn.worker, .{context}));
+        const thread = try std.Thread.spawn(.{}, WorkerFn.worker, .{context});
+        try thread_pool.append(allocator, thread);
     }
 
     // Wait for all threads to complete
@@ -2009,7 +2012,7 @@ fn softmax(comptime T: type, tensor: *Tensor(T), dim: usize) !void {
 /// Note:
 /// - If the array is empty, the behavior of this function is undefined.
 pub fn gelu(comptime T: type, tensor: *Tensor(T)) !void {
-    if (@typeInfo(T) != .Float) {
+    if (@typeInfo(T) != .float) {
         @compileError("GELU operation requires floating-point tensor");
     }
 

diff --git a/src/tensor.zig b/src/tensor.zig
@@ -44,15 +44,16 @@ pub fn Tensor(comptime DataType: type) type {
                 }
             }
             const shape_copy = try allocator.alloc(usize, shape.len);
-            @memcpy(shape_copy, shape);
+            @memcpy(shape_copy[0..], shape);
 
             // Now we know size fits in usize
             const final_size: usize = @intCast(size);
-            const data = try allocator.alignedAlloc(DataType, 32, final_size);
+            const alignment: std.mem.Alignment = .@"32";
+            const data = try allocator.alignedAlloc(DataType, alignment, final_size);
             if (DataType == bool) {
-                @memset(data, false);
+                @memset(data[0..], false);
             } else {
-                @memset(data, 0);
+                @memset(data[0..], 0);
             }
 
             const self = Tensor(DataType){
@@ -163,7 +164,7 @@ pub fn Tensor(comptime DataType: type) type {
 
             // Update shape
             const new_shape_copy = try self.allocator.alloc(usize, new_shape.len);
-            @memcpy(new_shape_copy, new_shape);
+            @memcpy(new_shape_copy[0..], new_shape);
 
             self.allocator.free(self.shape);
             self.shape = new_shape_copy;
@@ -338,14 +339,14 @@ pub fn Tensor(comptime DataType: type) type {
                 // Create coordinate arrays
                 var src_coords = try self.allocator.alloc(usize, self.shape.len);
                 defer self.allocator.free(src_coords);
-                @memset(src_coords, 0);
+                @memset(src_coords[0..], 0);
 
                 // Set the fixed dimension to the specified index
                 src_coords[dim] = index;
 
                 var dst_coords = try self.allocator.alloc(usize, result.shape.len);
                 defer self.allocator.free(dst_coords);
-                @memset(dst_coords, 0);
+                @memset(dst_coords[0..], 0);
 
                 // Copy data
                 const total_elements = calculateSize(result.shape);
@@ -444,7 +445,7 @@ pub fn Tensor(comptime DataType: type) type {
             // Copy data with proper indexing
             var coords = try self.allocator.alloc(usize, self.shape.len);
             defer self.allocator.free(coords);
-            @memset(coords, 0);
+            @memset(coords[0..], 0);
 
             var result_idx: usize = 0;
             while (true) {
@@ -568,7 +569,7 @@ pub fn Tensor(comptime DataType: type) type {
         /// - An error if the tensor cannot be copied for some reason.
         pub fn copy(self: Self) !Self {
             const new_tensor = try Self.init(self.allocator, self.shape);
-            @memcpy(new_tensor.data, self.data);
+            @memcpy(new_tensor.data[0..], self.data);
             return new_tensor;
         }