state-spaces · ChrisLundquist · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026
diff --git a/mamba_ssm/ops/selective_scan_interface.py b/mamba_ssm/ops/selective_scan_interface.py
@@ -17,7 +17,10 @@
 
 from mamba_ssm.ops.triton.layer_norm import _layer_norm_fwd
 
-import selective_scan_cuda
+try:
+    import selective_scan_cuda
+except ImportError:
+    selective_scan_cuda = None
 
 
 class SelectiveScanFn(torch.autograd.Function):

diff --git a/mamba_ssm/ops/triton/mamba3/angle_dt.py b/mamba_ssm/ops/triton/mamba3/angle_dt.py
@@ -16,7 +16,7 @@
     configs=[
         triton.Config({}, num_stages=s, num_warps=w)
         for s in [1, 2, 3]
-        for w in [2, 4, 8]
+        for w in [1, 2, 4, 8]
     ],
     key=["CHUNK_SIZE", "BLOCK_D", "HAS_INIT_STATE", "RETURN_OUTPUT_STATE", "IS_VARLEN"],
 )
@@ -224,7 +224,7 @@ def angle_dt_fwd(
     configs=[
         triton.Config({}, num_stages=s, num_warps=w)
         for s in [1, 2, 3]
-        for w in [2, 4, 8]
+        for w in [1, 2, 4, 8]
     ],
     key=["CHUNK_SIZE", "BLOCK_D", "HAS_INIT_STATE", "HAS_GRAD_OUTPUT_STATE", "IS_VARLEN"],
 )

diff --git a/mamba_ssm/ops/triton/mamba3/mamba3_siso_bwd.py b/mamba_ssm/ops/triton/mamba3/mamba3_siso_bwd.py
@@ -13,19 +13,27 @@
 
 import triton
 import triton.language as tl
-from mamba_ssm.ops.triton.mamba3.utils import cos_approx, sin_approx, sigmoid_approx
+from mamba_ssm.ops.triton.mamba3.utils import (
+    cos_approx, sin_approx, sigmoid_approx,
+    _maxnreg, MAXNREG_VALUES, MAXNREG_VALUES_SMALL,
+)
 
 # =============================================================================
 # dZ Kernel
 # =============================================================================
 
 @triton.autotune(
     configs=[
-        triton.Config({"CHUNK_SIZE": cs}, num_stages=s, num_warps=w, maxnreg=r)
+        triton.Config({"CHUNK_SIZE": cs}, num_stages=s, num_warps=w, **_maxnreg(r))
         for cs in [32, 64]
         for s in [1, 2, 3]
         for w in [2, 4, 8]
-        for r in [None, 128, 256]
+        for r in MAXNREG_VALUES
+    ] + [
+        # Smaller configs for GPUs with limited register files (e.g. AMD RDNA4).
+        triton.Config({"CHUNK_SIZE": cs}, num_stages=1, num_warps=1, **_maxnreg(r))
+        for cs in [16, 32]
+        for r in MAXNREG_VALUES_SMALL
     ],
     key=["HEADDIM_V"]
 )
@@ -193,10 +201,14 @@ def grid(META):
 
 @triton.autotune(
     configs=[
-        triton.Config({}, num_stages=s, num_warps=w, maxnreg=r)
+        triton.Config({}, num_stages=s, num_warps=w, **_maxnreg(r))
         for s in [1, 2, 3]
         for w in [2, 4, 8]
-        for r in [None, 128, 256]
+        for r in MAXNREG_VALUES
+    ] + [
+        # Smaller configs for GPUs with limited register files (e.g. AMD RDNA4).
+        triton.Config({}, num_stages=1, num_warps=1, **_maxnreg(r))
+        for r in MAXNREG_VALUES_SMALL
     ],
     key=["CHUNK_SIZE", "HEADDIM_QK", "HEADDIM_V", "IS_VARLEN"]
 )
@@ -811,10 +823,14 @@ def compute_dqkv(
 
 @triton.autotune(
     configs=[
-        triton.Config({}, num_stages=s, num_warps=w, maxnreg=r)
+        triton.Config({}, num_stages=s, num_warps=w, **_maxnreg(r))
         for s in [1, 2, 3]
         for w in [2, 4, 8]
-        for r in [None, 128, 256]
+        for r in MAXNREG_VALUES
+    ] + [
+        # Smaller configs for GPUs with limited register files (e.g. AMD RDNA4).
+        triton.Config({}, num_stages=1, num_warps=1, **_maxnreg(r))
+        for r in MAXNREG_VALUES_SMALL
     ],
     key=["CHUNK_SIZE", "BLOCK_HEADDIM_QK", "HEADDIM_QK", "GQA_RATIO"]
 )
@@ -1418,11 +1434,16 @@ def apply_dk_state_post(
 # =============================================================================
 @triton.autotune(
     configs=[
-        triton.Config({"CHUNK_SIZE": cs}, num_stages=s, num_warps=w, maxnreg=r)
+        triton.Config({"CHUNK_SIZE": cs}, num_stages=s, num_warps=w, **_maxnreg(r))
         for cs in [64, 128, 256]
         for s in [1, 2, 3]
         for w in [2, 4, 8]
-        for r in [None, 128, 256]
+        for r in MAXNREG_VALUES
+    ] + [
+        # Smaller configs for GPUs with limited register files (e.g. AMD RDNA4).
+        triton.Config({"CHUNK_SIZE": cs}, num_stages=1, num_warps=1, **_maxnreg(r))
+        for cs in [32, 64]
+        for r in MAXNREG_VALUES_SMALL
     ],
     key=["HEADDIM_V", "HEADDIM_QK", "HAS_INPUT_STATE", "IS_VARLEN"]
 )

diff --git a/mamba_ssm/ops/triton/mamba3/mamba3_siso_fwd.py b/mamba_ssm/ops/triton/mamba3/mamba3_siso_fwd.py
@@ -13,17 +13,26 @@
 
 import triton
 import triton.language as tl
-from mamba_ssm.ops.triton.mamba3.utils import cos_approx, sin_approx, tanh_approx, silu, sigmoid_approx
+from mamba_ssm.ops.triton.mamba3.utils import (
+    cos_approx, sin_approx, tanh_approx, silu, sigmoid_approx,
+    _maxnreg, MAXNREG_VALUES, MAXNREG_VALUES_SMALL,
+)
 
 @triton.autotune(
     configs=[
-        triton.Config({}, num_stages=s, num_warps=w, maxnreg=r)
+        triton.Config({}, num_stages=s, num_warps=w, **_maxnreg(r))
         for s in [1, 2, 3]
         for w in [2, 4, 8]
-        for r in [None, 128, 256]
+        for r in MAXNREG_VALUES
+    ] + [
+        # Configs targeting GPUs with smaller register files (e.g. AMD RDNA4).
+        # num_warps=1 is half the smallest warp count in the list above;
+        # num_stages=1 avoids extra live-range overlap from software pipelining.
+        triton.Config({}, num_stages=1, num_warps=1, **_maxnreg(r))
+        for r in MAXNREG_VALUES_SMALL
     ],
     key=[
-        "CHUNK_SIZE", "HEADDIM_QK", "HEADDIM_V", "STORE_SSM_STATES_ADT_OUTV", "HAS_D", 
+        "CHUNK_SIZE", "HEADDIM_QK", "HEADDIM_V", "STORE_SSM_STATES_ADT_OUTV", "HAS_D",
         "HAS_Z", "HAS_INITIAL_STATES", "RETURN_FINAL_STATES", "IS_VARLEN"],
 )
 @triton.jit

diff --git a/mamba_ssm/ops/triton/mamba3/mamba3_siso_step.py b/mamba_ssm/ops/triton/mamba3/mamba3_siso_step.py
@@ -18,7 +18,7 @@
     configs=[
         triton.Config({}, num_stages=s, num_warps=w)
         for s in [1, 2, 3]
-        for w in [2, 4, 8]
+        for w in [1, 2, 4, 8]
     ],
     key=[
         "HEADDIM_QK", "HEADDIM_V", "HAS_D", "HAS_Z",],