From d3e82b8e3620e9aee4c9b30b895cc9a64a1800ac Mon Sep 17 00:00:00 2001
From: Eric Foley <ericdfoley@gmail.com>
Date: Thu, 20 Mar 2025 08:03:45 -0600
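Subject: [PATCH 1/2] feat: print run time from main script

Measure and print the prompt-processing time and the total time
(prompt plus generation) using time.time().

Also raise the sampling loop from 20 to 64 iterations and remove the
per-step print of the logits.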

---
 scripts/main.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/scripts/main.py b/scripts/main.py
index 17024e9..f7abf24 100755
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import argparse
 from pathlib import Path
+import time
 
 import numpy as np
 from transformers import AutoTokenizer
@@ -27,6 +28,9 @@ def test(tokenizer, model, state) -> str:
 
     token_ids = tokenizer(text)["input_ids"]
 
+    # start timing here
+    start_time = time.time()
+
     # feed in the prompt
     pos = 0
     logits = None
@@ -37,18 +41,23 @@ def test(tokenizer, model, state) -> str:
 
     output_tokens = []
 
+    prompt_time = time.time() - start_time
+
     # sample starting with last token of prompt
     last_output_token_id = np.argmax(logits)
     output_tokens.append(last_output_token_id)
     #
-    for _ in range(20):
+    for _ in range(64):
         logits = forward(model, state, last_output_token_id, pos)
         pos += 1
         assert np.all(np.isfinite(logits))
-        print(logits)
         last_output_token_id = np.argmax(logits)
         output_tokens.append(last_output_token_id)
 
+    end_time = time.time() - start_time 
+
+    print(f"Prompt time: {prompt_time:.4f}s")
+    print(f"Total time: {end_time:.4f}s")
     print(tokenizer.decode(output_tokens, skip_special_tokens=True))
 
 

From 97d161b9cf5f34250e931a2f00e748663c6e8b13 Mon Sep 17 00:00:00 2001
From: Eric Foley <ericdfoley@gmail.com>
Date: Thu, 20 Mar 2025 08:04:17 -0600
Subject: [PATCH 2/2] feat: test speed using blas (on Mac)

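When building on macOS (__APPLE__), implement mv and mva with
cblas_sgemv from the Accelerate framework; other platforms keep the
existing plain C loops. Remove mm from matmul.c and matmul.h, and
build with -DACCELERATE_NEW_LAPACK against the Accelerate headers.

Using BLAS gives about a 30-40x speedup on Qwen2-0.5B-Instruct. The
speedup is probably due to both threading and vectorization.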
---
 Makefile     |  4 +++-
 src/matmul.c | 29 ++++++++++++++++-------------
 src/matmul.h |  9 ---------
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/Makefile b/Makefile
index fc53458..42e5d7b 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,9 @@ OBJS=$(SOURCES:%=$(BUILD)/%.o)
 
 
 ifeq ($(UNAME),Darwin)
-	CFLAGS+=-I/Library/Developer/CommandLineTools/SDKs/MacOSX15.2.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers
+	FRAMEWORKS_DIR=/Library/Developer/CommandLineTools/SDKs/MacOSX15.2.sdk/System/Library/Frameworks/
+	ACCELERATE_HEADERS=$(FRAMEWORKS_DIR)/Accelerate.framework/Versions/A/Headers
+	CFLAGS+=-I$(ACCELERATE_HEADERS) -DACCELERATE_NEW_LAPACK
 	LDFLAGS+=-dynamiclib -framework Accelerate
 else
 	LDFLAGS+=-shared
diff --git a/src/matmul.c b/src/matmul.c
index b1db9f2..a556dd1 100644
--- a/src/matmul.c
+++ b/src/matmul.c
@@ -1,21 +1,22 @@
 #include "matmul.h"
 
-void mm(const float *restrict A, const float *restrict B, float *restrict C, int M, int N, int K)
+#ifdef __APPLE__
+#include <Accelerate/Accelerate.h>
+#endif
+
+#ifdef __APPLE__
+void mva(const float *restrict A, const float *restrict x, const float *restrict b, float *restrict y, int M, int N)
 {
-    for (int i = 0; i < M; ++i)
-    {
-        for (int j = 0; j < N; ++j)
-        {
-            float sum = 0.0;
-            for (int k = 0; k < K; ++k)
-            {
-                sum += A[i * K + k] * B[k * N + j];
-            }
-            C[i * N + j] = sum;
-        }
-    }
+    memcpy(y, b, sizeof(float) * M); // Copy b to y
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0f, A, N, x, 1, 1.0f, y, 1);
+}
+
+void mv(const float *restrict A, const float *restrict x, float *restrict y, int M, int N)
+{
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0f, A, N, x, 1, 0.0f, y, 1);
 }
 
+#else
 void mva(const float *restrict A, const float *restrict x, const float *restrict b, float *restrict y, int M, int N)
 {
     for (int i = 0; i < M; ++i)
@@ -40,3 +41,5 @@ void mv(const float *restrict A, const float *restrict x, float *restrict y, int
         }
     }
 }
+
+#endif
\ No newline at end of file
diff --git a/src/matmul.h b/src/matmul.h
index 7d94039..31b8275 100644
--- a/src/matmul.h
+++ b/src/matmul.h
@@ -1,14 +1,5 @@
 #pragma once
 
-/**
- * Perform matrix multiplication of two matrices A and B.
- *
- * @param A Pointer to the first matrix, with dimensions M x K.
- * @param B Pointer to the second matrix, with dimensions K x N.
- * @param C Pointer to the result matrix, with dimensions M x N.
- */
-void mm(const float *restrict A, const float *restrict B, float *restrict C, int M, int N, int K);
-
 /**
  * Compute
  * y = Ax + b