From d3e82b8e3620e9aee4c9b30b895cc9a64a1800ac Mon Sep 17 00:00:00 2001 From: Eric Foley Date: Thu, 20 Mar 2025 08:03:45 -0600 Subject: [PATCH 1/2] feat: print run time from main script --- scripts/main.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/main.py b/scripts/main.py index 17024e9..f7abf24 100755 --- a/scripts/main.py +++ b/scripts/main.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse from pathlib import Path +import time import numpy as np from transformers import AutoTokenizer @@ -27,6 +28,9 @@ def test(tokenizer, model, state) -> str: token_ids = tokenizer(text)["input_ids"] + # start timing here + start_time = time.time() + # feed in the prompt pos = 0 logits = None @@ -37,18 +41,23 @@ def test(tokenizer, model, state) -> str: output_tokens = [] + prompt_time = time.time() - start_time + # sample starting with last token of prompt last_output_token_id = np.argmax(logits) output_tokens.append(last_output_token_id) # - for _ in range(20): + for _ in range(64): logits = forward(model, state, last_output_token_id, pos) pos += 1 assert np.all(np.isfinite(logits)) - print(logits) last_output_token_id = np.argmax(logits) output_tokens.append(last_output_token_id) + end_time = time.time() - start_time + + print(f"Prompt time: {prompt_time:.4f}s") + print(f"Total time: {end_time:.4f}s") print(tokenizer.decode(output_tokens, skip_special_tokens=True)) From 97d161b9cf5f34250e931a2f00e748663c6e8b13 Mon Sep 17 00:00:00 2001 From: Eric Foley Date: Thu, 20 Mar 2025 08:04:17 -0600 Subject: [PATCH 2/2] feat: test speed using blas (on Mac) This gives about 30-40x speedup on Qwen2-0.5B-Instruct. That is probably due to both threading and vectorization. 
--- Makefile | 4 +++- src/matmul.c | 29 ++++++++++++++++------------- src/matmul.h | 9 --------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index fc53458..42e5d7b 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,9 @@ OBJS=$(SOURCES:%=$(BUILD)/%.o) ifeq ($(UNAME),Darwin) - CFLAGS+=-I/Library/Developer/CommandLineTools/SDKs/MacOSX15.2.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers + FRAMEWORKS_DIR=/Library/Developer/CommandLineTools/SDKs/MacOSX15.2.sdk/System/Library/Frameworks/ + ACCELERATE_HEADERS=$(FRAMEWORKS_DIR)/Accelerate.framework/Versions/A/Headers + CFLAGS+=-I$(ACCELERATE_HEADERS) -DACCELERATE_NEW_LAPACK LDFLAGS+=-dynamiclib -framework Accelerate else LDFLAGS+=-shared diff --git a/src/matmul.c b/src/matmul.c index b1db9f2..a556dd1 100644 --- a/src/matmul.c +++ b/src/matmul.c @@ -1,21 +1,22 @@ #include "matmul.h" -void mm(const float *restrict A, const float *restrict B, float *restrict C, int M, int N, int K) +#ifdef __APPLE__ +#include <Accelerate/Accelerate.h> +#endif + +#ifdef __APPLE__ +void mva(const float *restrict A, const float *restrict x, const float *restrict b, float *restrict y, int M, int N) { - for (int i = 0; i < M; ++i) - { - for (int j = 0; j < N; ++j) - { - float sum = 0.0; - for (int k = 0; k < K; ++k) - { - sum += A[i * K + k] * B[k * N + j]; - } - C[i * N + j] = sum; - } - } + memcpy(y, b, sizeof(float) * M); // Copy b to y + cblas_sgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0f, A, N, x, 1, 1.0f, y, 1); +} + +void mv(const float *restrict A, const float *restrict x, float *restrict y, int M, int N) +{ + cblas_sgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0f, A, N, x, 1, 0.0f, y, 1); } +#else void mva(const float *restrict A, const float *restrict x, const float *restrict b, float *restrict y, int M, int N) { for (int i = 0; i < M; ++i) @@ -40,3 +41,5 @@ void mv(const float *restrict A, const float *restrict x, float *restrict y, int } } } + +#endif \ No newline at 
end of file diff --git a/src/matmul.h b/src/matmul.h index 7d94039..31b8275 100644 --- a/src/matmul.h +++ b/src/matmul.h @@ -1,14 +1,5 @@ #pragma once -/** - * Perform matrix multiplication of two matrices A and B. - * - * @param A Pointer to the first matrix, with dimensions M x K. - * @param B Pointer to the second matrix, with dimensions K x N. - * @param C Pointer to the result matrix, with dimensions M x N. - */ -void mm(const float *restrict A, const float *restrict B, float *restrict C, int M, int N, int K); - /** * Compute * y = Ax + b