From 7db8c74f4a097cc12d0ccb98b3cbea4d38ae4828 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Wed, 18 Feb 2026 18:12:49 +0100 Subject: [PATCH 1/5] feat: Added cuda version selection to uv build. --- README.md | 6 +++--- pyproject.toml | 53 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a7d855df0..e448dfa37 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ It is recommended to install Modalities via uv or install PyTorch, psutil and Ni # Get uv (tested with uv version 0.9.13) curl -LsSf https://astral.sh/uv/install.sh | sh -uv sync +uv sync --extra [cpu|cu126|cu128|cu130] # Get CUDA version via nvidia-smi source .venv/bin/activate # For developers: use [tests,linting] and install pre-commit hooks -uv sync --extra tests --extra linting +uv sync --extra [cpu|cu126|cu128|cu130] --extra tests --extra linting pre-commit install --install-hooks ``` @@ -60,7 +60,7 @@ conda create -n modalities python=3.13 conda activate modalities # Install PyTorch, psutil, Ninja and Flash Attention -pip install "torch<2.11.0" +pip install "torch<2.11.0" # Or appropriate version for your CUDA setup. 
pip install psutil ninja # Ninja lowers compilation time of flash attention significantly pip install flash-attn==2.8.3 --no-build-isolation ``` diff --git a/pyproject.toml b/pyproject.toml index b41fddd08..f8b5d9aa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,6 @@ description = "Modalities, a PyTorch-native framework for distributed and reprod readme = "README.md" dependencies = [ "numpy", - "torch<2.11.0", "ninja", "packaging", "tqdm", @@ -33,10 +32,6 @@ dependencies = [ Homepage = "https://github.com/Modalities/modalities" Issues = "https://github.com/Modalities/modalities/issues" -[project.optional-dependencies] -linting = ["pre-commit"] -tests = ["pytest", "pytest-cov", "debugpy"] - [project.scripts] modalities = "modalities.__main__:main" @@ -44,6 +39,54 @@ modalities = "modalities.__main__:main" requires = ["setuptools >= 61.0.0"] build-backend = "setuptools.build_meta" +[project.optional-dependencies] +linting = ["pre-commit"] +tests = ["pytest", "pytest-cov", "debugpy"] + +cpu = ["torch>=2.10,<2.11.0"] +cu126 = ["torch>=2.10,<2.11.0"] +cu128 = ["torch>=2.10,<2.11.0"] +cu130 = ["torch>=2.10,<2.11.0"] + +[tool.uv] +conflicts = [ + [ + { extra = "cpu" }, + { extra = "cu126" }, + { extra = "cu128" }, + { extra = "cu130" }, + ], +] + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", extra = "cpu" }, + { index = "pytorch-cu126", extra = "cu126" }, + { index = "pytorch-cu128", extra = "cu128" }, + { index = "pytorch-cu130", extra = "cu130" }, +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu126" +url = "https://download.pytorch.org/whl/cu126" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" +explicit = true + + [tool.uv.extra-build-dependencies] flash-attn = [ { requirement 
= "torch", match-runtime = true }, From af49ae3c9d10aa81f7bad896d9d7361e091ffaa5 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 19 Feb 2026 11:11:25 +0100 Subject: [PATCH 2/5] refactor: moved flash attention from dependencies into cuda specific optional dependencies --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f8b5d9aa2..42183776b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "matplotlib", "wandb", "einops>=0.7.0", - "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'", "debugpy", # For VSCode debugging support ] @@ -44,9 +43,9 @@ linting = ["pre-commit"] tests = ["pytest", "pytest-cov", "debugpy"] cpu = ["torch>=2.10,<2.11.0"] -cu126 = ["torch>=2.10,<2.11.0"] -cu128 = ["torch>=2.10,<2.11.0"] -cu130 = ["torch>=2.10,<2.11.0"] +cu126 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] +cu128 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] +cu130 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] [tool.uv] conflicts = [ From 60aa4f4c14b62eca9281411c91cd9a5fd7fef655 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 19 Feb 2026 14:51:30 +0100 Subject: [PATCH 3/5] fix: Added torchvision to dependencies due to import error when importing PreTrainedModel. 
--- pyproject.toml | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 42183776b..97928b108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,10 +42,22 @@ build-backend = "setuptools.build_meta" linting = ["pre-commit"] tests = ["pytest", "pytest-cov", "debugpy"] -cpu = ["torch>=2.10,<2.11.0"] -cu126 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] -cu128 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] -cu130 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] +cpu = ["torch>=2.10,<2.11.0", "torchvision"] +cu126 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] +cu128 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] +cu130 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] [tool.uv] conflicts = [ From 68123acee5d95e8c2b0c10483d6eed00d84cadb8 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 19 Feb 2026 16:50:58 +0100 Subject: [PATCH 4/5] fix: set torchvision dependency to also use the correct index --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 97928b108..9ea55a4d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,12 @@ torch = [ { index = "pytorch-cu128", extra = "cu128" }, { index = "pytorch-cu130", extra = "cu130" }, ] +torchvision = [ + { index = "pytorch-cpu", extra = "cpu" }, + { index = "pytorch-cu126", extra = "cu126" }, + { index = "pytorch-cu128", extra = "cu128" }, + { index = "pytorch-cu130", extra = "cu130" }, +] [[tool.uv.index]] name = "pytorch-cpu" From 
11c40c32ab293facb9e76a62794b15556ad0c1f6 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 20 Feb 2026 18:19:50 +0100 Subject: [PATCH 5/5] feat(utils): Added mfu support for B200. --- src/modalities/utils/mfu.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/modalities/utils/mfu.py b/src/modalities/utils/mfu.py index 76c63d122..201577702 100644 --- a/src/modalities/utils/mfu.py +++ b/src/modalities/utils/mfu.py @@ -14,7 +14,8 @@ # https://www.nvidia.com/en-us/data-center/h100/ # # NOTE: These values are valid for fp16 and bf16 only -PEAK_PERFORMANCE = {"A100": 312e12, "H100": 989e12, "GH200": 989e12} +# NOTE: B200 dense bf16/fp16 peak is 2.25 PFLOPS; NVIDIA's 4.5 PFLOPS figure assumes 2:4 structured sparsity, which does not apply to dense-training MFU (matching the dense A100/H100 values above). +PEAK_PERFORMANCE = {"A100": 312e12, "H100": 989e12, "GH200": 989e12, "B200": 2.25e15} class MFUCalculatorABC: @@ -130,6 +131,10 @@ def _get_theoretical_gpu_peak_performance(model_parts: FSDPX | list[FSDP2], worl single_gpu_peak_performance = MFUCalculatorABC._get_theoretical_gpu_peak_performance_single( precision, "GH200" ) + elif device_name.startswith("NVIDIA B200"): + single_gpu_peak_performance = MFUCalculatorABC._get_theoretical_gpu_peak_performance_single( + precision, "B200" + ) else: warnings.warn(f"Could not get theoretical GPU peak performance for unknown device = {device_name}.") return None