From 7db8c74f4a097cc12d0ccb98b3cbea4d38ae4828 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Wed, 18 Feb 2026 18:12:49 +0100 Subject: [PATCH 1/5] feat: Added cuda version selection to uv build. --- README.md | 6 +++--- pyproject.toml | 53 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a7d855df0..e448dfa37 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ It is recommended to install Modalities via uv or install PyTorch, psutil and Ni # Get uv (tested with uv version 0.9.13) curl -LsSf https://astral.sh/uv/install.sh | sh -uv sync +uv sync --extra [cpu|cu126|cu128|cu130] # Get CUDA version via nvidia-smi source .venv/bin/activate # For developers: use [tests,linting] and install pre-commit hooks -uv sync --extra tests --extra linting +uv sync --extra [cpu|cu126|cu128|cu130] --extra tests --extra linting pre-commit install --install-hooks ``` @@ -60,7 +60,7 @@ conda create -n modalities python=3.13 conda activate modalities # Install PyTorch, psutil, Ninja and Flash Attention -pip install "torch<2.11.0" +pip install "torch<2.11.0" # Or appropriate version for your CUDA setup. 
pip install psutil ninja # Ninja lowers compilation time of flash attention significantly pip install flash-attn==2.8.3 --no-build-isolation ``` diff --git a/pyproject.toml b/pyproject.toml index b41fddd08..f8b5d9aa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,6 @@ description = "Modalities, a PyTorch-native framework for distributed and reprod readme = "README.md" dependencies = [ "numpy", - "torch<2.11.0", "ninja", "packaging", "tqdm", @@ -33,10 +32,6 @@ dependencies = [ Homepage = "https://github.com/Modalities/modalities" Issues = "https://github.com/Modalities/modalities/issues" -[project.optional-dependencies] -linting = ["pre-commit"] -tests = ["pytest", "pytest-cov", "debugpy"] - [project.scripts] modalities = "modalities.__main__:main" @@ -44,6 +39,54 @@ modalities = "modalities.__main__:main" requires = ["setuptools >= 61.0.0"] build-backend = "setuptools.build_meta" +[project.optional-dependencies] +linting = ["pre-commit"] +tests = ["pytest", "pytest-cov", "debugpy"] + +cpu = ["torch>=2.10,<2.11.0"] +cu126 = ["torch>=2.10,<2.11.0"] +cu128 = ["torch>=2.10,<2.11.0"] +cu130 = ["torch>=2.10,<2.11.0"] + +[tool.uv] +conflicts = [ + [ + { extra = "cpu" }, + { extra = "cu126" }, + { extra = "cu128" }, + { extra = "cu130" }, + ], +] + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", extra = "cpu" }, + { index = "pytorch-cu126", extra = "cu126" }, + { index = "pytorch-cu128", extra = "cu128" }, + { index = "pytorch-cu130", extra = "cu130" }, +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu126" +url = "https://download.pytorch.org/whl/cu126" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" +explicit = true + + [tool.uv.extra-build-dependencies] flash-attn = [ { requirement 
= "torch", match-runtime = true }, From af49ae3c9d10aa81f7bad896d9d7361e091ffaa5 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 19 Feb 2026 11:11:25 +0100 Subject: [PATCH 2/5] refactor: moved flash attention from dependencies into cuda specific optional dependencies --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f8b5d9aa2..42183776b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "matplotlib", "wandb", "einops>=0.7.0", - "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'", "debugpy", # For VSCode debugging support ] @@ -44,9 +43,9 @@ linting = ["pre-commit"] tests = ["pytest", "pytest-cov", "debugpy"] cpu = ["torch>=2.10,<2.11.0"] -cu126 = ["torch>=2.10,<2.11.0"] -cu128 = ["torch>=2.10,<2.11.0"] -cu130 = ["torch>=2.10,<2.11.0"] +cu126 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] +cu128 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] +cu130 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] [tool.uv] conflicts = [ From 60aa4f4c14b62eca9281411c91cd9a5fd7fef655 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 19 Feb 2026 14:51:30 +0100 Subject: [PATCH 3/5] fix: Added torchvision to dependencies due to import error when importing PreTrainedModel. 
--- pyproject.toml | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 42183776b..97928b108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,10 +42,22 @@ build-backend = "setuptools.build_meta" linting = ["pre-commit"] tests = ["pytest", "pytest-cov", "debugpy"] -cpu = ["torch>=2.10,<2.11.0"] -cu126 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] -cu128 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] -cu130 = ["torch>=2.10,<2.11.0", "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"] +cpu = ["torch>=2.10,<2.11.0", "torchvision"] +cu126 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] +cu128 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] +cu130 = [ + "torch>=2.10,<2.11.0", + "torchvision", + "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'" +] [tool.uv] conflicts = [ From 68123acee5d95e8c2b0c10483d6eed00d84cadb8 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 19 Feb 2026 16:50:58 +0100 Subject: [PATCH 4/5] fix: set torchvision dependency to also use the correct index --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 97928b108..9ea55a4d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,12 @@ torch = [ { index = "pytorch-cu128", extra = "cu128" }, { index = "pytorch-cu130", extra = "cu130" }, ] +torchvision = [ + { index = "pytorch-cpu", extra = "cpu" }, + { index = "pytorch-cu126", extra = "cu126" }, + { index = "pytorch-cu128", extra = "cu128" }, + { index = "pytorch-cu130", extra = "cu130" }, +] [[tool.uv.index]] name = "pytorch-cpu" From 
11c40c32ab293facb9e76a62794b15556ad0c1f6 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 20 Feb 2026 18:19:50 +0100 Subject: [PATCH 5/5] feat(utils): Added mfu support for B200. --- src/modalities/utils/mfu.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/modalities/utils/mfu.py b/src/modalities/utils/mfu.py index 76c63d122..201577702 100644 --- a/src/modalities/utils/mfu.py +++ b/src/modalities/utils/mfu.py @@ -14,7 +14,8 @@ # https://www.nvidia.com/en-us/data-center/h100/ # # NOTE: These values are valid for fp16 and bf16 only -PEAK_PERFORMANCE = {"A100": 312e12, "H100": 989e12, "GH200": 989e12} +# NOTE: B200 dense bf16/fp16 peak is 2.25 PFLOPS; NVIDIA's 4.5 PFLOPS figure assumes 2:4 structured sparsity, which does not apply to dense-training MFU (matching the dense A100/H100 values above). +PEAK_PERFORMANCE = {"A100": 312e12, "H100": 989e12, "GH200": 989e12, "B200": 2.25e15} class MFUCalculatorABC: @@ -130,6 +131,10 @@ def _get_theoretical_gpu_peak_performance(model_parts: FSDPX | list[FSDP2], worl single_gpu_peak_performance = MFUCalculatorABC._get_theoretical_gpu_peak_performance_single( precision, "GH200" ) + elif device_name.startswith("NVIDIA B200"): + single_gpu_peak_performance = MFUCalculatorABC._get_theoretical_gpu_peak_performance_single( + precision, "B200" + ) else: warnings.warn(f"Could not get theoretical GPU peak performance for unknown device = {device_name}.") return None