diff --git a/Makefile b/Makefile index 6d6a967..04e3947 100644 --- a/Makefile +++ b/Makefile @@ -12,19 +12,16 @@ SOURCES=$(wildcard src/*.c) OBJS=$(SOURCES:%=$(BUILD)/%.o) - - ifeq ($(UNAME),Darwin) FRAMEWORKS_DIR=/Library/Developer/CommandLineTools/SDKs/MacOSX15.2.sdk/System/Library/Frameworks/ ACCELERATE_HEADERS=$(FRAMEWORKS_DIR)/Accelerate.framework/Versions/A/Headers CFLAGS+=-I$(ACCELERATE_HEADERS) -DACCELERATE_NEW_LAPACK LDFLAGS+=-dynamiclib -framework Accelerate else - CFLAGS+=-fopenmp + CFLAGS+=-fopenmp -mavx2 -mfma LDFLAGS+=-shared -fopenmp endif - ifeq ($(UNAME),Darwin) LIBRARY_NAME=libeel.dylib else diff --git a/eel/eel.py b/eel/eel.py index 55467b2..af4b1ff 100644 --- a/eel/eel.py +++ b/eel/eel.py @@ -1,5 +1,7 @@ import ctypes import json +import platform +from pathlib import Path from typing import Any import numpy as np @@ -9,8 +11,12 @@ from ._cdefs import Config, InferState, LayerWeights, Model, Weights -# TODO EDF don't hardcode -LibPath = "/Users/eric/Development/eel/build/libeel.dylib" +RootDir = Path(__file__).parent.parent + +if platform.system() == "Darwin": + LibPath = RootDir / "build" / "libeel.dylib" +else: + LibPath = RootDir / "build" / "libeel.so" lib = ctypes.CDLL(LibPath) @@ -274,34 +280,13 @@ def load_model(config: Config, weights: Weights) -> Model: def init_state(config: Config) -> InferState: - state = InferState() - - state.x1 = (ctypes.c_float * config.size)() - state.x2 = (ctypes.c_float * config.size)() - state.x3 = (ctypes.c_float * config.size)() - - state.h1 = (ctypes.c_float * config.ffn_hidden_size)() - state.h2 = (ctypes.c_float * config.ffn_hidden_size)() - state.h3 = (ctypes.c_float * config.ffn_hidden_size)() - - kv_cache_size = ( - config.num_layers * config.max_seq_len * config.num_kv_heads * config.head_size + lib.make_state.argtypes = ( + ctypes.POINTER(Config), ) - state.k_cache = (ctypes.c_float * kv_cache_size)() - state.v_cache = (ctypes.c_float * kv_cache_size)() - - q_size = config.num_q_heads * 
config.head_size - kv_size = config.num_kv_heads * config.head_size - state.q = (ctypes.c_float * q_size)() - state.k = (ctypes.c_float * kv_size)() - state.v = (ctypes.c_float * kv_size)() - - state.score = (ctypes.c_float * config.max_seq_len)() - state.mha_out = (ctypes.c_float * q_size)() - - state.logits = (ctypes.c_float * config.vocab_size)() + lib.make_state.restype = ctypes.POINTER(InferState) - return state + state_ptr = lib.make_state(ctypes.pointer(config)) + return _deref(state_ptr) def forward(model: Model, state: InferState, token: int, pos: int) -> np.array: diff --git a/src/eel.c b/src/eel.c index bd2db03..bc3470f 100644 --- a/src/eel.c +++ b/src/eel.c @@ -12,6 +12,37 @@ #define EEL_DEBUG_STATE 0 // very slow! +struct InferState *make_state(struct Config *config) { + struct InferState *state = malloc(sizeof(struct InferState)); + + state->x1 = aligned_alloc(32, config->size * sizeof(float)); + state->x2 = aligned_alloc(32, config->size * sizeof(float)); + state->x3 = aligned_alloc(32, config->size * sizeof(float)); + + state->h1 = aligned_alloc(32, config->ffn_hidden_size * sizeof(float)); + state->h2 = aligned_alloc(32, config->ffn_hidden_size * sizeof(float)); + state->h3 = aligned_alloc(32, config->ffn_hidden_size * sizeof(float)); + + size_t kv_cache_size = ( + config->num_layers * config->max_seq_len * config->num_kv_heads * config->head_size + ); + state->k_cache = aligned_alloc(32, kv_cache_size * sizeof(float)); + state->v_cache = aligned_alloc(32, kv_cache_size * sizeof(float)); + + size_t q_size = config->num_q_heads * config->head_size; + size_t kv_size = config->num_kv_heads * config->head_size; + state->q = aligned_alloc(32, q_size * sizeof(float)); + state->k = aligned_alloc(32, kv_size * sizeof(float)); + state->v = aligned_alloc(32, kv_size * sizeof(float)); + + state->score = aligned_alloc(32, config->max_seq_len * sizeof(float)); + state->mha_out = aligned_alloc(32, q_size * sizeof(float)); + + state->logits = aligned_alloc(32, 
config->vocab_size * sizeof(float));
+
+    return state;
+}
+
 /**
  * Forward one layer.
  *
diff --git a/src/eel.h b/src/eel.h
index 3935ede..9ea4019 100644
--- a/src/eel.h
+++ b/src/eel.h
@@ -88,6 +88,8 @@ struct Model {
     struct Weights *weights;
 };
 
+struct InferState *make_state(struct Config *config);
+
 /**
  * Forward pass of the model by one token.
  *
diff --git a/src/matmul.c b/src/matmul.c
index 9cac89f..c787fd8 100644
--- a/src/matmul.c
+++ b/src/matmul.c
@@ -1,5 +1,8 @@
 #include "matmul.h"
 
+#include <assert.h>
+#include <stdint.h>
+
 #ifdef __APPLE__
 #include <Accelerate/Accelerate.h>
 #endif
@@ -8,6 +11,10 @@
 #include <omp.h>
 #endif
 
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
+
 #if defined(__APPLE__)
 
 void mva(const float *restrict A, const float *restrict x, const float *restrict b, float *restrict y, int M, int N)
 {
@@ -20,6 +27,65 @@
 void mv(const float *restrict A, const float *restrict x, float *restrict y, int M, int N)
 {
     cblas_sgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0f, A, N, x, 1, 0.0f, y, 1);
 }
 
+#elif defined(__AVX2__)
+
+// see https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
+float hsum_ps_sse3(__m128 v) {
+    __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0
+    __m128 sums = _mm_add_ps(v, shuf);
+    shuf = _mm_movehl_ps(shuf, sums); // high half -> low half
+    sums = _mm_add_ss(sums, shuf);
+    return _mm_cvtss_f32(sums);
+}
+
+float hsum256_ps_avx(__m256 v) {
+    __m128 vlow = _mm256_castps256_ps128(v);
+    __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128
+    vlow = _mm_add_ps(vlow, vhigh);  // add the low 128
+    return hsum_ps_sse3(vlow);  // and inline the sse3 version, which is optimal for AVX
+    // (no wasted instructions, and all of them are the 4B minimum)
+}
+
+void mva(const float *restrict A, const float *restrict x, const float *restrict b, float *restrict y, int M, int N)
+{
+    assert((uintptr_t)A % 32 == 0);
+    assert((uintptr_t)x % 32 == 0);
+
+    #pragma omp parallel for
+    for (int i = 0; i < M; ++i)
+    {
+        const float *rowA = &A[i * N];
+        __m256 sum = 
_mm256_setzero_ps(); + + for (int j = 0; j < N; j += 8) + { + __m256 vA = _mm256_loadu_ps(&rowA[j]); + __m256 vx = _mm256_loadu_ps(&x[j]); + + // do the fma operation. there isn't any speed difference between + // using FMA vs plain AVX multiply and add. not surprising since we're + // surely just memory bound loading from A & x. + #if 1 + sum = _mm256_fmadd_ps(vA, vx, sum); // sum += A[j] * x[j] + #else + __m256 vAx = _mm256_mul_ps(vA, vx); + sum = _mm256_add_ps(sum, vAx); + #endif + } + + if (b) { + y[i] = hsum256_ps_avx(sum) + b[i]; + } else { + y[i] = hsum256_ps_avx(sum); + } + } +} + +void mv(const float *restrict A, const float *restrict x, float *restrict y, int M, int N) +{ + mva(A, x, NULL, y, M, N); +} + #else void mva(const float *restrict A, const float *restrict x, const float *restrict b, float *restrict y, int M, int N)