Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ pub fn build(b: *std.Build) void {

// Tests
const tests = b.addTest(.{
.root_source_file = b.path("src/tests.zig"),
.target = target,
.optimize = .ReleaseSafe, // ReleaseSafe for tests
.root_module = b.createModule(.{
.root_source_file = b.path("src/tests.zig"),
.target = target,
.optimize = .ReleaseSafe,
}),
});

const run_tests = b.addRunArtifact(tests);
Expand All @@ -42,11 +44,12 @@ pub fn build(b: *std.Build) void {
// Benchmarks
const bench = b.addExecutable(.{
.name = "bench",
.root_source_file = b.path("src/bench.zig"),
.target = target,
.optimize = .ReleaseFast, // ReleaseFast for benchmarks
.root_module = b.createModule(.{
.root_source_file = b.path("src/bench.zig"),
.target = target,
.optimize = .ReleaseFast,
}),
});

const run_bench = b.addRunArtifact(bench);

// Create a step for running the benchmarks
Expand Down
5 changes: 3 additions & 2 deletions build.zig.zon
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
.{
.name = "katana",
.name = .katana,
.fingerprint = 0x1b2081fce0df035f,
.version = "0.1.0",

.minimum_zig_version = "0.13.0",
.minimum_zig_version = "0.15.2",

.paths = .{
// Include all source files
Expand Down
23 changes: 17 additions & 6 deletions src/bench.zig
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pub fn calculateGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usiz
defer b.deinit();

// Initialize with random data
var prng = std.rand.DefaultPrng.init(0);
var prng = std.Random.DefaultPrng.init(0);
var random = prng.random();
for (a.data) |*val| val.* = random.float(f32);
for (b.data) |*val| val.* = random.float(f32);
Expand Down Expand Up @@ -53,7 +53,14 @@ pub fn main() !void {
defer arena.deinit();
const allocator = arena.allocator();

// Define test sizes
// 1. Define an explicit buffer for the stdout writer
var stdout_buffer: [1024]u8 = undefined;

// 2. Get the file handle and create the buffered writer
var stdout_writer_wrapper = std.fs.File.stdout().writer(&stdout_buffer);
const stdout = &stdout_writer_wrapper.interface;

// Define test sizes
const sizes = [_]struct { m: usize, n: usize, k: usize }{
.{ .m = 256, .n = 256, .k = 256 },
.{ .m = 512, .n = 512, .k = 512 },
Expand All @@ -68,12 +75,16 @@ pub fn main() !void {

const iterations = 5;

try std.io.getStdOut().writer().print("\nRunning MatMul Benchmark\n", .{});
try std.io.getStdOut().writer().print("T = {d} \n", .{T});
try std.io.getStdOut().writer().print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()});
// 3. Use the new 'stdout' Writer interface pointer for printing
try stdout.print("\nRunning MatMul Benchmark\n", .{});
try stdout.print("T = {d} \n", .{T});
try stdout.print("Number of threads = {d}\n", .{try std.Thread.getCpuCount()});

for (sizes) |size| {
const avg_gflops = try calculateGflops(allocator, size.m, size.n, size.k, iterations);
try std.io.getStdOut().writer().print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops });
try stdout.print("Matrix size: {d}x{d}x{d}, GFLOPS: {d:.2}\n", .{ size.m, size.n, size.k, avg_gflops });
}

// 4. IMPORTANT: Flush the buffer to ensure all output is written to the terminal
try stdout.flush();
}
41 changes: 22 additions & 19 deletions src/ops.zig
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ pub fn transpose(comptime T: type, tensor: *Tensor(T)) !void {

const rows = tensor.shape[0];
const cols = tensor.shape[1];
var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), 32, rows * cols);
const alignment: std.mem.Alignment = .@"32";
var new_data = try tensor.allocator.alignedAlloc(@TypeOf(tensor.data[0]), alignment, rows * cols);

for (0..rows) |i| {
for (0..cols) |j| {
Expand Down Expand Up @@ -161,7 +162,8 @@ pub fn transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us
}

// Allocate memory for transposed data
var new_data = try tensor.allocator.alignedAlloc(T, 32, tensor.data.len);
const alignment: std.mem.Alignment = .@"32";
var new_data = try tensor.allocator.alignedAlloc(T, alignment, tensor.data.len);
errdefer tensor.allocator.free(new_data);

// Calculate new strides
Expand All @@ -177,7 +179,7 @@ pub fn transposeAxes(comptime T: type, tensor: *Tensor(T), dim0: usize, dim1: us
// Create coordinate arrays
var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
defer tensor.allocator.free(coords);
@memset(coords, 0);
@memset(coords[0..], 0);

// Perform the transpose operation
const total_elements = tensor.data.len;
Expand Down Expand Up @@ -334,7 +336,7 @@ pub fn getChunk(comptime T: type, tensor: Tensor(T), dim: usize, chunk_idx: usiz
var result_idx: usize = 0;
var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
defer tensor.allocator.free(coords);
@memset(coords, 0);
@memset(coords[0..], 0);

while (result_idx < total_elements) : (result_idx += 1) {
// Calculate source coordinates
Expand Down Expand Up @@ -446,7 +448,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize)
if (first_size > 0) {
var coords = try tensor.allocator.alloc(usize, tensor.shape.len);
defer tensor.allocator.free(coords);
@memset(coords, 0);
@memset(coords[0..], 0);

var idx: usize = 0;
while (idx < first_size) : (idx += 1) {
Expand Down Expand Up @@ -494,7 +496,7 @@ pub fn concat(comptime T: type, tensor: Tensor(T), other: Tensor(T), dim: usize)
if (second_size > 0) {
var coords = try tensor.allocator.alloc(usize, other.shape.len);
defer tensor.allocator.free(coords);
@memset(coords, 0);
@memset(coords[0..], 0);

var idx: usize = 0;
while (idx < second_size) : (idx += 1) {
Expand Down Expand Up @@ -644,7 +646,7 @@ pub fn stack(comptime T: type, tensors: []const Tensor(T), dim: usize) !Tensor(T
// Copy data from each input tensor
var coords = try ref_tensor.allocator.alloc(usize, result.shape.len);
defer ref_tensor.allocator.free(coords);
@memset(coords, 0);
@memset(coords[0..], 0);

const elements_per_tensor = calculateSize(ref_shape);

Expand Down Expand Up @@ -809,7 +811,7 @@ pub fn randomTensor(comptime T: type, allocator: std.mem.Allocator, shape: []con
var tensor = try Tensor(T).init(allocator, shape);
errdefer tensor.deinit();

var rng = std.rand.DefaultPrng.init(seed);
var rng = std.Random.DefaultPrng.init(seed);
for (tensor.data) |*val| {
val.* = rng.random().float(T) * 2.0 - 1.0; // Values between -1 and 1
}
Expand Down Expand Up @@ -850,14 +852,14 @@ pub fn zeros(comptime T: type, allocator: Allocator, shape: []const usize) !Tens
}

// Allocate aligned data array
const alignment = 32;
const alignment: std.mem.Alignment = .@"32";
const data = try allocator.alignedAlloc(T, alignment, total_size);
// Initialize all elements to zero
@memset(data, 0);
@memset(data[0..], 0);

// Create tensor shape
const tensor_shape = try allocator.alloc(usize, shape.len);
@memcpy(tensor_shape, shape);
@memcpy(tensor_shape[0..], shape);

// Return initialized tensor
return Tensor(T){
Expand Down Expand Up @@ -954,7 +956,7 @@ pub fn getStabilityInfo(comptime T: type, tensor: Tensor(T)) !Tensor(T).Stabilit
var info = Tensor(@TypeOf(tensor.data[0])).StabilityInfo{};

switch (@typeInfo(@TypeOf(tensor.data[0]))) {
.Float => {
.float => {
for (tensor.data, 0..) |value, i| {
if (std.math.isNan(value)) {
info.has_nan = true;
Expand Down Expand Up @@ -1054,7 +1056,7 @@ pub fn hasInf(comptime T: type, tensor: Tensor(T)) !bool {
/// ```
pub fn replaceUnstable(comptime T: type, tensor: *Tensor(T), replacement: T) !void {
switch (@typeInfo(@TypeOf(tensor.data[0]))) {
.Float => {
.float => {
for (tensor.data) |*value| {
if (std.math.isNan(value.*) or std.math.isInf(value.*)) {
value.* = replacement;
Expand Down Expand Up @@ -1377,7 +1379,7 @@ pub fn broadcast_multiply(comptime T: type, a: *Tensor(T), b: Tensor(T)) !void {
}

// Copy result back to a
@memcpy(a.data, result.data);
@memcpy(a.data[0..], result.data);
}

/// Helper function for broadcasting subtraction.
Expand Down Expand Up @@ -1479,7 +1481,7 @@ pub fn matmul(comptime T: type, a: Tensor(T), b: Tensor(T), allocator: Allocator
errdefer result.deinit();

// Initialize result to zero
@memset(result.data, 0);
@memset(result.data[0..], 0);

// Simple triple-loop matrix multiplication
for (0..M) |i| {
Expand All @@ -1505,7 +1507,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
errdefer result.deinit();

// Initialize result to zero
@memset(result.data, 0);
@memset(result.data[0..], 0);

// Calculate tile grid dimensions
const tiles_M = (M + Tile - 1) / Tile;
Expand All @@ -1520,7 +1522,7 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten

// Create thread pool
var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
defer thread_pool.deinit();
defer thread_pool.deinit(allocator);

// Create thread context
const context = ThreadContext{
Expand All @@ -1544,7 +1546,8 @@ fn optimizedMatmulF32(a: Tensor(f32), b: Tensor(f32), allocator: Allocator) !Ten
};

for (0..num_threads) |_| {
try thread_pool.append(try std.Thread.spawn(.{}, WorkerFn.worker, .{context}));
const thread = try std.Thread.spawn(.{}, WorkerFn.worker, .{context});
try thread_pool.append(allocator, thread);
}

// Wait for all threads to complete
Expand Down Expand Up @@ -2009,7 +2012,7 @@ fn softmax(comptime T: type, tensor: *Tensor(T), dim: usize) !void {
/// Note:
/// - If the array is empty, the behavior of this function is undefined.
pub fn gelu(comptime T: type, tensor: *Tensor(T)) !void {
if (@typeInfo(T) != .Float) {
if (@typeInfo(T) != .float) {
@compileError("GELU operation requires floating-point tensor");
}

Expand Down
19 changes: 10 additions & 9 deletions src/tensor.zig
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,16 @@ pub fn Tensor(comptime DataType: type) type {
}
}
const shape_copy = try allocator.alloc(usize, shape.len);
@memcpy(shape_copy, shape);
@memcpy(shape_copy[0..], shape);

// Now we know size fits in usize
const final_size: usize = @intCast(size);
const data = try allocator.alignedAlloc(DataType, 32, final_size);
const alignment: std.mem.Alignment = .@"32";
const data = try allocator.alignedAlloc(DataType, alignment, final_size);
if (DataType == bool) {
@memset(data, false);
@memset(data[0..], false);
} else {
@memset(data, 0);
@memset(data[0..], 0);
}

const self = Tensor(DataType){
Expand Down Expand Up @@ -163,7 +164,7 @@ pub fn Tensor(comptime DataType: type) type {

// Update shape
const new_shape_copy = try self.allocator.alloc(usize, new_shape.len);
@memcpy(new_shape_copy, new_shape);
@memcpy(new_shape_copy[0..], new_shape);

self.allocator.free(self.shape);
self.shape = new_shape_copy;
Expand Down Expand Up @@ -338,14 +339,14 @@ pub fn Tensor(comptime DataType: type) type {
// Create coordinate arrays
var src_coords = try self.allocator.alloc(usize, self.shape.len);
defer self.allocator.free(src_coords);
@memset(src_coords, 0);
@memset(src_coords[0..], 0);

// Set the fixed dimension to the specified index
src_coords[dim] = index;

var dst_coords = try self.allocator.alloc(usize, result.shape.len);
defer self.allocator.free(dst_coords);
@memset(dst_coords, 0);
@memset(dst_coords[0..], 0);

// Copy data
const total_elements = calculateSize(result.shape);
Expand Down Expand Up @@ -444,7 +445,7 @@ pub fn Tensor(comptime DataType: type) type {
// Copy data with proper indexing
var coords = try self.allocator.alloc(usize, self.shape.len);
defer self.allocator.free(coords);
@memset(coords, 0);
@memset(coords[0..], 0);

var result_idx: usize = 0;
while (true) {
Expand Down Expand Up @@ -568,7 +569,7 @@ pub fn Tensor(comptime DataType: type) type {
/// - An error if the tensor cannot be copied for some reason.
pub fn copy(self: Self) !Self {
const new_tensor = try Self.init(self.allocator, self.shape);
@memcpy(new_tensor.data, self.data);
@memcpy(new_tensor.data[0..], self.data);
return new_tensor;
}

Expand Down
Loading